In [1]:
from __future__ import absolute_import, division, print_function

In [2]:
import codecs
import glob
import multiprocessing
import os
import pprint
import re

In [3]:
import nltk
import gensim.models.word2vec as w2v
import sklearn.manifold
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

In [4]:
%pylab inline


Populating the interactive namespace from numpy and matplotlib

In [5]:
nltk.download("punkt")


[nltk_data] Downloading package punkt to
[nltk_data]     /home/divesh_pandey/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
Out[5]:
True

Getting and cleaning data


In [6]:
hindi_filenames = sorted(glob.glob("../data/hin_corp_unicode/*txt"))
#hindi_filenames

In [7]:
corpus_raw = u""
for file_name in hindi_filenames:
    print("Reading '{0}'...".format(file_name))
    with codecs.open(file_name, "r", "utf-8") as f:
        # Starting two lines are not useful in corpus
        temp = f.readline()
        temp = f.readline()
        corpus_raw += f.read()
    print("Corpus is now {0} characters long".format(len(corpus_raw)))
    print()


Reading '../data/hin_corp_unicode/1000_utf.txt'...
Corpus is now 15764 characters long

Reading '../data/hin_corp_unicode/1001_utf.txt'...
Corpus is now 33663 characters long

Reading '../data/hin_corp_unicode/1002_utf.txt'...
Corpus is now 48153 characters long

Reading '../data/hin_corp_unicode/1003_utf.txt'...
Corpus is now 63362 characters long

Reading '../data/hin_corp_unicode/1004_utf.txt'...
Corpus is now 77899 characters long

Reading '../data/hin_corp_unicode/1005_utf.txt'...
Corpus is now 95324 characters long

Reading '../data/hin_corp_unicode/1006_utf.txt'...
Corpus is now 106578 characters long

Reading '../data/hin_corp_unicode/1007_utf.txt'...
Corpus is now 118142 characters long

Reading '../data/hin_corp_unicode/1008_utf.txt'...
Corpus is now 132864 characters long

Reading '../data/hin_corp_unicode/1009_utf.txt'...
Corpus is now 144821 characters long

Reading '../data/hin_corp_unicode/100_utf.txt'...
Corpus is now 154593 characters long

Reading '../data/hin_corp_unicode/1010_utf.txt'...
Corpus is now 170781 characters long

Reading '../data/hin_corp_unicode/1011_utf.txt'...
Corpus is now 185033 characters long

Reading '../data/hin_corp_unicode/1012_utf.txt'...
Corpus is now 201858 characters long

Reading '../data/hin_corp_unicode/1013_utf.txt'...
Corpus is now 212884 characters long

Reading '../data/hin_corp_unicode/1014_utf.txt'...
Corpus is now 226149 characters long

Reading '../data/hin_corp_unicode/1015_utf.txt'...
Corpus is now 237890 characters long

Reading '../data/hin_corp_unicode/1016_utf.txt'...
Corpus is now 253906 characters long

Reading '../data/hin_corp_unicode/1017_utf.txt'...
Corpus is now 266044 characters long

Reading '../data/hin_corp_unicode/1018_utf.txt'...
Corpus is now 289256 characters long

Reading '../data/hin_corp_unicode/1019_utf.txt'...
Corpus is now 307677 characters long

Reading '../data/hin_corp_unicode/101_utf.txt'...
Corpus is now 319262 characters long

Reading '../data/hin_corp_unicode/1020_utf.txt'...
Corpus is now 333035 characters long

Reading '../data/hin_corp_unicode/1021_utf.txt'...
Corpus is now 345398 characters long

Reading '../data/hin_corp_unicode/1022_utf.txt'...
Corpus is now 364372 characters long

Reading '../data/hin_corp_unicode/1023_utf.txt'...
Corpus is now 374942 characters long

Reading '../data/hin_corp_unicode/1024_utf.txt'...
Corpus is now 389504 characters long

Reading '../data/hin_corp_unicode/1025_utf.txt'...
Corpus is now 402124 characters long

Reading '../data/hin_corp_unicode/1026_utf.txt'...
Corpus is now 415294 characters long

Reading '../data/hin_corp_unicode/1027_utf.txt'...
Corpus is now 429118 characters long

Reading '../data/hin_corp_unicode/1028_utf.txt'...
Corpus is now 441083 characters long

Reading '../data/hin_corp_unicode/1029_utf.txt'...
Corpus is now 452915 characters long

Reading '../data/hin_corp_unicode/102_utf.txt'...
Corpus is now 463474 characters long

Reading '../data/hin_corp_unicode/1030_utf.txt'...
Corpus is now 476956 characters long

Reading '../data/hin_corp_unicode/1031_utf.txt'...
Corpus is now 490632 characters long

Reading '../data/hin_corp_unicode/1032_utf.txt'...
Corpus is now 502763 characters long

Reading '../data/hin_corp_unicode/1033_utf.txt'...
Corpus is now 516883 characters long

Reading '../data/hin_corp_unicode/1034_utf.txt'...
Corpus is now 529055 characters long

Reading '../data/hin_corp_unicode/1035_utf.txt'...
Corpus is now 542161 characters long

Reading '../data/hin_corp_unicode/1036_utf.txt'...
Corpus is now 558328 characters long

Reading '../data/hin_corp_unicode/1037_utf.txt'...
Corpus is now 577599 characters long

Reading '../data/hin_corp_unicode/1038_utf.txt'...
Corpus is now 592185 characters long

Reading '../data/hin_corp_unicode/1039_utf.txt'...
Corpus is now 603482 characters long

Reading '../data/hin_corp_unicode/103_utf.txt'...
Corpus is now 615778 characters long

Reading '../data/hin_corp_unicode/1040_utf.txt'...
Corpus is now 627410 characters long

Reading '../data/hin_corp_unicode/1041_utf.txt'...
Corpus is now 639794 characters long

Reading '../data/hin_corp_unicode/1042_utf.txt'...
Corpus is now 655508 characters long

Reading '../data/hin_corp_unicode/1043_utf.txt'...
Corpus is now 671222 characters long

Reading '../data/hin_corp_unicode/1044_utf.txt'...
Corpus is now 680037 characters long

Reading '../data/hin_corp_unicode/1045_utf.txt'...
Corpus is now 689238 characters long

Reading '../data/hin_corp_unicode/1046_utf.txt'...
Corpus is now 703815 characters long

Reading '../data/hin_corp_unicode/1047_utf.txt'...
Corpus is now 714513 characters long

Reading '../data/hin_corp_unicode/1048_utf.txt'...
Corpus is now 725701 characters long

Reading '../data/hin_corp_unicode/1049_utf.txt'...
Corpus is now 735080 characters long

Reading '../data/hin_corp_unicode/104_utf.txt'...
Corpus is now 744998 characters long

Reading '../data/hin_corp_unicode/1050_utf.txt'...
Corpus is now 755339 characters long

Reading '../data/hin_corp_unicode/1051_utf.txt'...
Corpus is now 767986 characters long

Reading '../data/hin_corp_unicode/1052_utf.txt'...
Corpus is now 779144 characters long

Reading '../data/hin_corp_unicode/1053_utf.txt'...
Corpus is now 789303 characters long

Reading '../data/hin_corp_unicode/1054_utf.txt'...
Corpus is now 804172 characters long

Reading '../data/hin_corp_unicode/1055_utf.txt'...
Corpus is now 817688 characters long

Reading '../data/hin_corp_unicode/1056_utf.txt'...
Corpus is now 833523 characters long

Reading '../data/hin_corp_unicode/1057_utf.txt'...
Corpus is now 846478 characters long

Reading '../data/hin_corp_unicode/1058_utf.txt'...
Corpus is now 859991 characters long

Reading '../data/hin_corp_unicode/1059_utf.txt'...
Corpus is now 859991 characters long

Reading '../data/hin_corp_unicode/105_utf.txt'...
Corpus is now 870258 characters long

Reading '../data/hin_corp_unicode/1060_utf.txt'...
Corpus is now 882560 characters long

Reading '../data/hin_corp_unicode/1061_utf.txt'...
Corpus is now 889832 characters long

Reading '../data/hin_corp_unicode/1062_utf.txt'...
Corpus is now 906535 characters long

Reading '../data/hin_corp_unicode/1063_utf.txt'...
Corpus is now 919811 characters long

Reading '../data/hin_corp_unicode/1064_utf.txt'...
Corpus is now 933032 characters long

Reading '../data/hin_corp_unicode/1065_utf.txt'...
Corpus is now 947554 characters long

Reading '../data/hin_corp_unicode/1066_utf.txt'...
Corpus is now 963606 characters long

Reading '../data/hin_corp_unicode/1067_utf.txt'...
Corpus is now 975799 characters long

Reading '../data/hin_corp_unicode/1068_utf.txt'...
Corpus is now 987608 characters long

Reading '../data/hin_corp_unicode/1069_utf.txt'...
Corpus is now 1000161 characters long

Reading '../data/hin_corp_unicode/106_utf.txt'...
Corpus is now 1005735 characters long

Reading '../data/hin_corp_unicode/1070_utf.txt'...
Corpus is now 1017269 characters long

Reading '../data/hin_corp_unicode/1071_utf.txt'...
Corpus is now 1028839 characters long

Reading '../data/hin_corp_unicode/1072_utf.txt'...
Corpus is now 1045725 characters long

Reading '../data/hin_corp_unicode/1073_utf.txt'...
Corpus is now 1057589 characters long

Reading '../data/hin_corp_unicode/1074_utf.txt'...
Corpus is now 1071326 characters long

Reading '../data/hin_corp_unicode/1075_utf.txt'...
Corpus is now 1084484 characters long

Reading '../data/hin_corp_unicode/1076_utf.txt'...
Corpus is now 1097388 characters long

Reading '../data/hin_corp_unicode/1077_utf.txt'...
Corpus is now 1111360 characters long

Reading '../data/hin_corp_unicode/1078_utf.txt'...
Corpus is now 1245315 characters long

Reading '../data/hin_corp_unicode/1079_utf.txt'...
Corpus is now 1254499 characters long

Reading '../data/hin_corp_unicode/107_utf.txt'...
Corpus is now 1262657 characters long

Reading '../data/hin_corp_unicode/1080_utf.txt'...
Corpus is now 1272779 characters long

Reading '../data/hin_corp_unicode/1081_utf.txt'...
Corpus is now 1284285 characters long

Reading '../data/hin_corp_unicode/1082_utf.txt'...
Corpus is now 1294212 characters long

Reading '../data/hin_corp_unicode/1083_utf.txt'...
Corpus is now 1299936 characters long

Reading '../data/hin_corp_unicode/1084_utf.txt'...
Corpus is now 1322878 characters long

Reading '../data/hin_corp_unicode/1085_utf.txt'...
Corpus is now 1339027 characters long

Reading '../data/hin_corp_unicode/1086_utf.txt'...
Corpus is now 1358000 characters long

Reading '../data/hin_corp_unicode/1087_utf.txt'...
Corpus is now 1371126 characters long

Reading '../data/hin_corp_unicode/1088_utf.txt'...
Corpus is now 1378603 characters long

Reading '../data/hin_corp_unicode/1089_utf.txt'...
Corpus is now 1386042 characters long

Reading '../data/hin_corp_unicode/108_utf.txt'...
Corpus is now 1397305 characters long

Reading '../data/hin_corp_unicode/1090_utf.txt'...
Corpus is now 1409499 characters long

Reading '../data/hin_corp_unicode/1091_utf.txt'...
Corpus is now 1422868 characters long

Reading '../data/hin_corp_unicode/1092_utf.txt'...
Corpus is now 1434318 characters long

Reading '../data/hin_corp_unicode/1093_utf.txt'...
Corpus is now 1446496 characters long

Reading '../data/hin_corp_unicode/1094_utf.txt'...
Corpus is now 1458628 characters long

Reading '../data/hin_corp_unicode/1095_utf.txt'...
Corpus is now 1470424 characters long

Reading '../data/hin_corp_unicode/1096_utf.txt'...
Corpus is now 1479903 characters long

Reading '../data/hin_corp_unicode/1097_utf.txt'...
Corpus is now 1489477 characters long

Reading '../data/hin_corp_unicode/1098_utf.txt'...
Corpus is now 1498965 characters long

Reading '../data/hin_corp_unicode/1099_utf.txt'...
Corpus is now 1508966 characters long

Reading '../data/hin_corp_unicode/109_utf.txt'...
Corpus is now 1517681 characters long

Reading '../data/hin_corp_unicode/10_utf8.txt'...
Corpus is now 1517681 characters long

Reading '../data/hin_corp_unicode/1100_utf.txt'...
Corpus is now 1526753 characters long

Reading '../data/hin_corp_unicode/1101_utf.txt'...
Corpus is now 1535995 characters long

Reading '../data/hin_corp_unicode/1102_utf.txt'...
Corpus is now 1548699 characters long

Reading '../data/hin_corp_unicode/1103_utf.txt'...
Corpus is now 1559247 characters long

Reading '../data/hin_corp_unicode/1104_utf.txt'...
Corpus is now 1571989 characters long

Reading '../data/hin_corp_unicode/1105_utf.txt'...
Corpus is now 1581501 characters long

Reading '../data/hin_corp_unicode/1106_utf.txt'...
Corpus is now 1592379 characters long

Reading '../data/hin_corp_unicode/1107_utf.txt'...
Corpus is now 1602546 characters long

Reading '../data/hin_corp_unicode/1108_utf.txt'...
Corpus is now 1614202 characters long

Reading '../data/hin_corp_unicode/1109_utf.txt'...
Corpus is now 1627371 characters long

Reading '../data/hin_corp_unicode/110_utf.txt'...
Corpus is now 1637978 characters long

Reading '../data/hin_corp_unicode/1110_utf.txt'...
Corpus is now 1650751 characters long

Reading '../data/hin_corp_unicode/1111_utf.txt'...
Corpus is now 1660708 characters long

Reading '../data/hin_corp_unicode/1112_utf.txt'...
Corpus is now 1672087 characters long

Reading '../data/hin_corp_unicode/1113_utf.txt'...
Corpus is now 1683636 characters long

Reading '../data/hin_corp_unicode/1114_utf.txt'...
Corpus is now 1697858 characters long

Reading '../data/hin_corp_unicode/1115_utf.txt'...
Corpus is now 1712990 characters long

Reading '../data/hin_corp_unicode/1116_utf.txt'...
Corpus is now 1727920 characters long

Reading '../data/hin_corp_unicode/1117_utf.txt'...
Corpus is now 1738544 characters long

Reading '../data/hin_corp_unicode/1118_utf.txt'...
Corpus is now 1748863 characters long

Reading '../data/hin_corp_unicode/1119_utf.txt'...
Corpus is now 1761075 characters long

Reading '../data/hin_corp_unicode/111_utf.txt'...
Corpus is now 1773923 characters long

Reading '../data/hin_corp_unicode/1120_utf.txt'...
Corpus is now 1786706 characters long

Reading '../data/hin_corp_unicode/1121_utf.txt'...
Corpus is now 1803759 characters long

Reading '../data/hin_corp_unicode/1122_utf.txt'...
Corpus is now 1816351 characters long

Reading '../data/hin_corp_unicode/1123_utf.txt'...
Corpus is now 1830221 characters long

Reading '../data/hin_corp_unicode/1124_utf.txt'...
Corpus is now 1845435 characters long

Reading '../data/hin_corp_unicode/1125_utf.txt'...
Corpus is now 1859781 characters long

Reading '../data/hin_corp_unicode/1126_utf.txt'...
Corpus is now 1873232 characters long

Reading '../data/hin_corp_unicode/1127_utf.txt'...
Corpus is now 1886498 characters long

Reading '../data/hin_corp_unicode/1128_utf.txt'...
Corpus is now 1902306 characters long

Reading '../data/hin_corp_unicode/1129_utf.txt'...
Corpus is now 1913924 characters long

Reading '../data/hin_corp_unicode/112_utf.txt'...
Corpus is now 1924358 characters long

Reading '../data/hin_corp_unicode/1130_utf.txt'...
Corpus is now 1935669 characters long

Reading '../data/hin_corp_unicode/1131_utf.txt'...
Corpus is now 1945838 characters long

Reading '../data/hin_corp_unicode/1132_utf.txt'...
Corpus is now 1956502 characters long

Reading '../data/hin_corp_unicode/1133_utf.txt'...
Corpus is now 1966760 characters long

Reading '../data/hin_corp_unicode/1134_utf.txt'...
Corpus is now 1978041 characters long

Reading '../data/hin_corp_unicode/1135_utf.txt'...
Corpus is now 1999388 characters long

Reading '../data/hin_corp_unicode/1136_utf.txt'...
Corpus is now 2009544 characters long

Reading '../data/hin_corp_unicode/1137_utf.txt'...
Corpus is now 2020061 characters long

Reading '../data/hin_corp_unicode/1138_utf.txt'...
Corpus is now 2031542 characters long

Reading '../data/hin_corp_unicode/1139_utf.txt'...
Corpus is now 2041658 characters long

Reading '../data/hin_corp_unicode/113_utf.txt'...
Corpus is now 2054635 characters long

Reading '../data/hin_corp_unicode/1140_utf.txt'...
Corpus is now 2064671 characters long

Reading '../data/hin_corp_unicode/1141_utf.txt'...
Corpus is now 2074453 characters long

Reading '../data/hin_corp_unicode/1142_utf.txt'...
Corpus is now 2084399 characters long

Reading '../data/hin_corp_unicode/1143_utf.txt'...
Corpus is now 2097400 characters long

Reading '../data/hin_corp_unicode/1144_utf.txt'...
Corpus is now 2107890 characters long

Reading '../data/hin_corp_unicode/1145_utf.txt'...
Corpus is now 2117922 characters long

Reading '../data/hin_corp_unicode/1146_utf.txt'...
Corpus is now 2127483 characters long

Reading '../data/hin_corp_unicode/1147_utf.txt'...
Corpus is now 2138945 characters long

Reading '../data/hin_corp_unicode/1148_utf.txt'...
Corpus is now 2149981 characters long

Reading '../data/hin_corp_unicode/1149_utf.txt'...
Corpus is now 2157493 characters long

Reading '../data/hin_corp_unicode/114_utf.txt'...
Corpus is now 2168073 characters long

Reading '../data/hin_corp_unicode/1150_utf.txt'...
Corpus is now 2180893 characters long

Reading '../data/hin_corp_unicode/1151_utf.txt'...
Corpus is now 2194251 characters long

Reading '../data/hin_corp_unicode/1152_utf.txt'...
Corpus is now 2204388 characters long

Reading '../data/hin_corp_unicode/1153_utf.txt'...
Corpus is now 2214759 characters long

Reading '../data/hin_corp_unicode/1154_utf.txt'...
Corpus is now 2225483 characters long

Reading '../data/hin_corp_unicode/1155_utf.txt'...
Corpus is now 2235061 characters long

Reading '../data/hin_corp_unicode/1156_utf.txt'...
Corpus is now 2246873 characters long

Reading '../data/hin_corp_unicode/1157_utf.txt'...
Corpus is now 2270926 characters long

Reading '../data/hin_corp_unicode/1158_utf.txt'...
Corpus is now 2278405 characters long

Reading '../data/hin_corp_unicode/115_utf.txt'...
Corpus is now 2288118 characters long

Reading '../data/hin_corp_unicode/1160_utf.txt'...
Corpus is now 2292344 characters long

Reading '../data/hin_corp_unicode/1161_utf.txt'...
Corpus is now 2302999 characters long

Reading '../data/hin_corp_unicode/1162_utf.txt'...
Corpus is now 2312185 characters long

Reading '../data/hin_corp_unicode/1163_utf.txt'...
Corpus is now 2320191 characters long

Reading '../data/hin_corp_unicode/1164_utf.txt'...
Corpus is now 2329773 characters long

Reading '../data/hin_corp_unicode/1165_utf.txt'...
Corpus is now 2337208 characters long

Reading '../data/hin_corp_unicode/1166_utf.txt'...
Corpus is now 2347525 characters long

Reading '../data/hin_corp_unicode/1167_utf.txt'...
Corpus is now 2356006 characters long

Reading '../data/hin_corp_unicode/1168_utf.txt'...
Corpus is now 2365511 characters long

Reading '../data/hin_corp_unicode/1169_utf.txt'...
Corpus is now 2378250 characters long

Reading '../data/hin_corp_unicode/116_utf.txt'...
Corpus is now 2388467 characters long

Reading '../data/hin_corp_unicode/1170_utf.txt'...
Corpus is now 2397501 characters long

Reading '../data/hin_corp_unicode/1171_utf.txt'...
Corpus is now 2407281 characters long

Reading '../data/hin_corp_unicode/1172_utf.txt'...
Corpus is now 2416850 characters long

Reading '../data/hin_corp_unicode/1173_utf.txt'...
Corpus is now 2427732 characters long

Reading '../data/hin_corp_unicode/1174_utf.txt'...
Corpus is now 2436884 characters long

Reading '../data/hin_corp_unicode/1175_utf.txt'...
Corpus is now 2444844 characters long

Reading '../data/hin_corp_unicode/1176_utf.txt'...
Corpus is now 2455852 characters long

Reading '../data/hin_corp_unicode/1177_utf.txt'...
Corpus is now 2463769 characters long

Reading '../data/hin_corp_unicode/1178_utf.txt'...
Corpus is now 2476557 characters long

Reading '../data/hin_corp_unicode/1179_utf.txt'...
Corpus is now 2476557 characters long

Reading '../data/hin_corp_unicode/117_utf.txt'...
Corpus is now 2484876 characters long

Reading '../data/hin_corp_unicode/1180_utf.txt'...
Corpus is now 2484876 characters long

Reading '../data/hin_corp_unicode/1181_utf.txt'...
Corpus is now 2492120 characters long

Reading '../data/hin_corp_unicode/1182_utf.txt'...
Corpus is now 2503852 characters long

Reading '../data/hin_corp_unicode/1183_utf.txt'...
Corpus is now 2513778 characters long

Reading '../data/hin_corp_unicode/1184_utf.txt'...
Corpus is now 2522787 characters long

Reading '../data/hin_corp_unicode/1185_utf.txt'...
Corpus is now 2533197 characters long

Reading '../data/hin_corp_unicode/1186_utf.txt'...
Corpus is now 2541103 characters long

Reading '../data/hin_corp_unicode/1187_utf.txt'...
Corpus is now 2552220 characters long

Reading '../data/hin_corp_unicode/1188_utf.txt'...
Corpus is now 2558627 characters long

Reading '../data/hin_corp_unicode/1189_utf.txt'...
Corpus is now 2568593 characters long

Reading '../data/hin_corp_unicode/118_utf.txt'...
Corpus is now 2577837 characters long

Reading '../data/hin_corp_unicode/1190_utf.txt'...
Corpus is now 2577837 characters long

Reading '../data/hin_corp_unicode/1191_utf.txt'...
Corpus is now 2589740 characters long

Reading '../data/hin_corp_unicode/1192_utf.txt'...
Corpus is now 2600457 characters long

Reading '../data/hin_corp_unicode/1193_utf.txt'...
Corpus is now 2612320 characters long

Reading '../data/hin_corp_unicode/1194_utf.txt'...
Corpus is now 2624351 characters long

Reading '../data/hin_corp_unicode/1195_utf.txt'...
Corpus is now 2635716 characters long

Reading '../data/hin_corp_unicode/1196_utf.txt'...
Corpus is now 2648028 characters long

Reading '../data/hin_corp_unicode/1197_utf.txt'...
Corpus is now 2654171 characters long

Reading '../data/hin_corp_unicode/1198_utf.txt'...
Corpus is now 2665069 characters long

Reading '../data/hin_corp_unicode/1199_utf.txt'...
Corpus is now 2675262 characters long

Reading '../data/hin_corp_unicode/119_utf.txt'...
Corpus is now 2686391 characters long

Reading '../data/hin_corp_unicode/11_utf8.txt'...
Corpus is now 2686391 characters long

Reading '../data/hin_corp_unicode/1200_utf.txt'...
Corpus is now 2697083 characters long

Reading '../data/hin_corp_unicode/1201_utf.txt'...
Corpus is now 2750848 characters long

Reading '../data/hin_corp_unicode/1202_utf.txt'...
Corpus is now 2761018 characters long

Reading '../data/hin_corp_unicode/1203_utf.txt'...
Corpus is now 2770808 characters long

Reading '../data/hin_corp_unicode/1204_utf.txt'...
Corpus is now 2783324 characters long

Reading '../data/hin_corp_unicode/1205_utf.txt'...
Corpus is now 2802845 characters long

Reading '../data/hin_corp_unicode/1206_utf.txt'...
Corpus is now 2816520 characters long

Reading '../data/hin_corp_unicode/1207_utf.txt'...
Corpus is now 2827395 characters long

Reading '../data/hin_corp_unicode/1208_utf.txt'...
Corpus is now 2840445 characters long

Reading '../data/hin_corp_unicode/1209_utf.txt'...
Corpus is now 2848265 characters long

Reading '../data/hin_corp_unicode/120_utf.txt'...
Corpus is now 2858811 characters long

Reading '../data/hin_corp_unicode/1210_utf.txt'...
Corpus is now 2874289 characters long

Reading '../data/hin_corp_unicode/1211_utf.txt'...
Corpus is now 2885632 characters long

Reading '../data/hin_corp_unicode/1212_utf.txt'...
Corpus is now 2893405 characters long

Reading '../data/hin_corp_unicode/1213_utf.txt'...
Corpus is now 2907759 characters long

Reading '../data/hin_corp_unicode/1214_utf.txt'...
Corpus is now 2918669 characters long

Reading '../data/hin_corp_unicode/1215_utf.txt'...
Corpus is now 2931345 characters long

Reading '../data/hin_corp_unicode/1216_utf.txt'...
Corpus is now 2943983 characters long

Reading '../data/hin_corp_unicode/1217_utf.txt'...
Corpus is now 2953782 characters long

Reading '../data/hin_corp_unicode/1218_utf.txt'...
Corpus is now 2965592 characters long

Reading '../data/hin_corp_unicode/1219_utf.txt'...
Corpus is now 2977052 characters long

Reading '../data/hin_corp_unicode/121_utf.txt'...
Corpus is now 2983707 characters long

Reading '../data/hin_corp_unicode/1220_utf.txt'...
Corpus is now 2994859 characters long

Reading '../data/hin_corp_unicode/1221_utf.txt'...
Corpus is now 3003901 characters long

Reading '../data/hin_corp_unicode/1222_utf.txt'...
Corpus is now 3017694 characters long

Reading '../data/hin_corp_unicode/1223_utf.txt'...
Corpus is now 3030022 characters long

Reading '../data/hin_corp_unicode/1224_utf.txt'...
Corpus is now 3040170 characters long

Reading '../data/hin_corp_unicode/1225_utf.txt'...
Corpus is now 3050266 characters long

Reading '../data/hin_corp_unicode/1226_utf.txt'...
Corpus is now 3059827 characters long

Reading '../data/hin_corp_unicode/1227_utf.txt'...
Corpus is now 3081787 characters long

Reading '../data/hin_corp_unicode/1228_utf.txt'...
Corpus is now 3093071 characters long

Reading '../data/hin_corp_unicode/1229_utf.txt'...
Corpus is now 3103477 characters long

Reading '../data/hin_corp_unicode/122_utf.txt'...
Corpus is now 3113532 characters long

Reading '../data/hin_corp_unicode/1230_utf.txt'...
Corpus is now 3126208 characters long

Reading '../data/hin_corp_unicode/1231_utf.txt'...
Corpus is now 3136430 characters long

Reading '../data/hin_corp_unicode/1232_utf.txt'...
Corpus is now 3146958 characters long

Reading '../data/hin_corp_unicode/1233_utf.txt'...
Corpus is now 3157456 characters long

Reading '../data/hin_corp_unicode/123_utf.txt'...
Corpus is now 3169580 characters long

Reading '../data/hin_corp_unicode/124_utf.txt'...
Corpus is now 3179593 characters long

Reading '../data/hin_corp_unicode/125_utf.txt'...
Corpus is now 3187626 characters long

Reading '../data/hin_corp_unicode/126_utf.txt'...
Corpus is now 3199528 characters long

Reading '../data/hin_corp_unicode/127_utf.txt'...
Corpus is now 3211722 characters long

Reading '../data/hin_corp_unicode/128_utf.txt'...
Corpus is now 3223003 characters long

Reading '../data/hin_corp_unicode/129_utf.txt'...
Corpus is now 3234838 characters long

Reading '../data/hin_corp_unicode/12_utf8.txt'...
Corpus is now 3234838 characters long

Reading '../data/hin_corp_unicode/130_utf.txt'...
Corpus is now 3246268 characters long

Reading '../data/hin_corp_unicode/131_utf.txt'...
Corpus is now 3259267 characters long

Reading '../data/hin_corp_unicode/132_utf.txt'...
Corpus is now 3268394 characters long

Reading '../data/hin_corp_unicode/133_utf.txt'...
Corpus is now 3279467 characters long

Reading '../data/hin_corp_unicode/134_utf.txt'...
Corpus is now 3290481 characters long

Reading '../data/hin_corp_unicode/135_utf.txt'...
Corpus is now 3299738 characters long

Reading '../data/hin_corp_unicode/136_utf.txt'...
Corpus is now 3312265 characters long

Reading '../data/hin_corp_unicode/137_utf.txt'...
Corpus is now 3323398 characters long

Reading '../data/hin_corp_unicode/138_utf.txt'...
Corpus is now 3334686 characters long

Reading '../data/hin_corp_unicode/139_utf.txt'...
Corpus is now 3346195 characters long

Reading '../data/hin_corp_unicode/13_utf8.txt'...
Corpus is now 3346195 characters long

Reading '../data/hin_corp_unicode/140_utf.txt'...
Corpus is now 3356806 characters long

Reading '../data/hin_corp_unicode/141_utf.txt'...
Corpus is now 3368547 characters long

Reading '../data/hin_corp_unicode/142_utf.txt'...
Corpus is now 3381978 characters long

Reading '../data/hin_corp_unicode/143_utf.txt'...
Corpus is now 3394604 characters long

Reading '../data/hin_corp_unicode/144_utf.txt'...
Corpus is now 3406369 characters long

Reading '../data/hin_corp_unicode/145_utf.txt'...
Corpus is now 3418696 characters long

Reading '../data/hin_corp_unicode/146_utf.txt'...
Corpus is now 3430965 characters long

Reading '../data/hin_corp_unicode/147_utf.txt'...
Corpus is now 3442255 characters long

Reading '../data/hin_corp_unicode/148_utf.txt'...
Corpus is now 3455283 characters long

Reading '../data/hin_corp_unicode/149_utf.txt'...
Corpus is now 3467130 characters long

Reading '../data/hin_corp_unicode/14_utf8.txt'...
Corpus is now 3467130 characters long

Reading '../data/hin_corp_unicode/150_utf.txt'...
Corpus is now 3480050 characters long

Reading '../data/hin_corp_unicode/151_utf.txt'...
Corpus is now 3495987 characters long

Reading '../data/hin_corp_unicode/152_utf.txt'...
Corpus is now 3505838 characters long

Reading '../data/hin_corp_unicode/153_utf.txt'...
Corpus is now 3514622 characters long

Reading '../data/hin_corp_unicode/154_utf.txt'...
Corpus is now 3522729 characters long

Reading '../data/hin_corp_unicode/155_utf.txt'...
Corpus is now 3532040 characters long

Reading '../data/hin_corp_unicode/156_utf.txt'...
Corpus is now 3541837 characters long

Reading '../data/hin_corp_unicode/157_utf.txt'...
Corpus is now 3550838 characters long

Reading '../data/hin_corp_unicode/158_utf.txt'...
Corpus is now 3564716 characters long

Reading '../data/hin_corp_unicode/15_utf8.txt'...
Corpus is now 3564716 characters long

Reading '../data/hin_corp_unicode/160_utf.txt'...
Corpus is now 3580209 characters long

Reading '../data/hin_corp_unicode/161_utf.txt'...
Corpus is now 3592993 characters long

Reading '../data/hin_corp_unicode/162_utf.txt'...
Corpus is now 3602175 characters long

Reading '../data/hin_corp_unicode/163_utf.txt'...
Corpus is now 3611018 characters long

Reading '../data/hin_corp_unicode/164_utf.txt'...
Corpus is now 3620033 characters long

Reading '../data/hin_corp_unicode/165_utf.txt'...
Corpus is now 3629194 characters long

Reading '../data/hin_corp_unicode/166_utf.txt'...
Corpus is now 3636956 characters long

Reading '../data/hin_corp_unicode/167_utf.txt'...
Corpus is now 3646730 characters long

Reading '../data/hin_corp_unicode/168_utf.txt'...
Corpus is now 3659282 characters long

Reading '../data/hin_corp_unicode/169_utf.txt'...
Corpus is now 3668692 characters long

Reading '../data/hin_corp_unicode/16_utf8.txt'...
Corpus is now 3668692 characters long

Reading '../data/hin_corp_unicode/170_utf.txt'...
Corpus is now 3676910 characters long

Reading '../data/hin_corp_unicode/171_utf.txt'...
Corpus is now 3689307 characters long

Reading '../data/hin_corp_unicode/172_utf.txt'...
Corpus is now 3703359 characters long

Reading '../data/hin_corp_unicode/173_utf.txt'...
Corpus is now 3714971 characters long

Reading '../data/hin_corp_unicode/174_utf.txt'...
Corpus is now 3728024 characters long

Reading '../data/hin_corp_unicode/175_utf.txt'...
Corpus is now 3742170 characters long

Reading '../data/hin_corp_unicode/176_utf.txt'...
Corpus is now 3754765 characters long

Reading '../data/hin_corp_unicode/177_utf.txt'...
Corpus is now 3768692 characters long

Reading '../data/hin_corp_unicode/178_utf.txt'...
Corpus is now 3783267 characters long

Reading '../data/hin_corp_unicode/179_utf.txt'...
Corpus is now 3798872 characters long

Reading '../data/hin_corp_unicode/17_utf8.txt'...
Corpus is now 3798872 characters long

Reading '../data/hin_corp_unicode/180_utf.txt'...
Corpus is now 3806384 characters long

Reading '../data/hin_corp_unicode/181_utf.txt'...
Corpus is now 3817090 characters long

Reading '../data/hin_corp_unicode/182_utf.txt'...
Corpus is now 3830451 characters long

Reading '../data/hin_corp_unicode/183_utf.txt'...
Corpus is now 3847490 characters long

Reading '../data/hin_corp_unicode/184_utf.txt'...
Corpus is now 3859543 characters long

Reading '../data/hin_corp_unicode/185_utf.txt'...
Corpus is now 3872552 characters long

Reading '../data/hin_corp_unicode/186_utf.txt'...
Corpus is now 3890828 characters long

Reading '../data/hin_corp_unicode/187_utf.txt'...
Corpus is now 3898888 characters long

Reading '../data/hin_corp_unicode/188_utf.txt'...
Corpus is now 3908924 characters long

Reading '../data/hin_corp_unicode/189_utf.txt'...
Corpus is now 3916870 characters long

Reading '../data/hin_corp_unicode/18_utf8.txt'...
Corpus is now 3916870 characters long

Reading '../data/hin_corp_unicode/190_utf.txt'...
Corpus is now 3928991 characters long

Reading '../data/hin_corp_unicode/191_utf.txt'...
Corpus is now 3936493 characters long

Reading '../data/hin_corp_unicode/192_utf.txt'...
Corpus is now 3948575 characters long

Reading '../data/hin_corp_unicode/193_utf.txt'...
Corpus is now 3960820 characters long

Reading '../data/hin_corp_unicode/194_utf.txt'...
Corpus is now 3975794 characters long

Reading '../data/hin_corp_unicode/195_utf.txt'...
Corpus is now 3987568 characters long

Reading '../data/hin_corp_unicode/196_utf.txt'...
Corpus is now 4001763 characters long

Reading '../data/hin_corp_unicode/197_utf.txt'...
Corpus is now 4020747 characters long

Reading '../data/hin_corp_unicode/198_utf.txt'...
Corpus is now 4032134 characters long

Reading '../data/hin_corp_unicode/199_utf.txt'...
Corpus is now 4041001 characters long

Reading '../data/hin_corp_unicode/19_utf8.txt'...
Corpus is now 4041001 characters long

Reading '../data/hin_corp_unicode/1_utf8.txt'...
Corpus is now 4041001 characters long

Reading '../data/hin_corp_unicode/200_utf.txt'...
Corpus is now 4041001 characters long

Reading '../data/hin_corp_unicode/201_utf.txt'...
Corpus is now 4052758 characters long

Reading '../data/hin_corp_unicode/202_utf.txt'...
Corpus is now 4063136 characters long

Reading '../data/hin_corp_unicode/203_utf.txt'...
Corpus is now 4075248 characters long

Reading '../data/hin_corp_unicode/204_utf.txt'...
Corpus is now 4085763 characters long

Reading '../data/hin_corp_unicode/205_utf.txt'...
Corpus is now 4101854 characters long

Reading '../data/hin_corp_unicode/206_utf.txt'...
Corpus is now 4114766 characters long

Reading '../data/hin_corp_unicode/207_utf.txt'...
Corpus is now 4128165 characters long

Reading '../data/hin_corp_unicode/208_utf.txt'...
Corpus is now 4143181 characters long

Reading '../data/hin_corp_unicode/209_utf.txt'...
Corpus is now 4155646 characters long

Reading '../data/hin_corp_unicode/20_utf8.txt'...
Corpus is now 4155646 characters long

Reading '../data/hin_corp_unicode/210_utf.txt'...
Corpus is now 4168357 characters long

Reading '../data/hin_corp_unicode/211_utf.txt'...
Corpus is now 4182887 characters long

Reading '../data/hin_corp_unicode/212_utf.txt'...
Corpus is now 4202240 characters long

Reading '../data/hin_corp_unicode/213_utf.txt'...
Corpus is now 4217115 characters long

Reading '../data/hin_corp_unicode/214_utf.txt'...
Corpus is now 4230117 characters long

Reading '../data/hin_corp_unicode/215_utf.txt'...
Corpus is now 4246072 characters long

Reading '../data/hin_corp_unicode/216_utf.txt'...
Corpus is now 4261517 characters long

Reading '../data/hin_corp_unicode/217_utf.txt'...
Corpus is now 4275997 characters long

Reading '../data/hin_corp_unicode/218_utf.txt'...
Corpus is now 4289869 characters long

Reading '../data/hin_corp_unicode/219_utf.txt'...
Corpus is now 4304987 characters long

Reading '../data/hin_corp_unicode/21_utf8.txt'...
Corpus is now 4311352 characters long

Reading '../data/hin_corp_unicode/220_utf.txt'...
Corpus is now 4325463 characters long

Reading '../data/hin_corp_unicode/221_utf.txt'...
Corpus is now 4338770 characters long

Reading '../data/hin_corp_unicode/222_utf.txt'...
Corpus is now 4352546 characters long

Reading '../data/hin_corp_unicode/223_utf.txt'...
Corpus is now 4365854 characters long

Reading '../data/hin_corp_unicode/224_utf.txt'...
Corpus is now 4379088 characters long

Reading '../data/hin_corp_unicode/225_utf.txt'...
Corpus is now 4392052 characters long

Reading '../data/hin_corp_unicode/226_utf.txt'...
Corpus is now 4406799 characters long

Reading '../data/hin_corp_unicode/227_utf.txt'...
Corpus is now 4421189 characters long

Reading '../data/hin_corp_unicode/228_utf.txt'...
Corpus is now 4435584 characters long

Reading '../data/hin_corp_unicode/229_utf.txt'...
Corpus is now 4450868 characters long

Reading '../data/hin_corp_unicode/22_utf8.txt'...
Corpus is now 4461686 characters long

Reading '../data/hin_corp_unicode/230_utf.txt'...
Corpus is now 4476164 characters long

Reading '../data/hin_corp_unicode/231_utf.txt'...
Corpus is now 4491097 characters long

Reading '../data/hin_corp_unicode/232_utf.txt'...
Corpus is now 4508902 characters long

Reading '../data/hin_corp_unicode/233_utf.txt'...
Corpus is now 4522731 characters long

Reading '../data/hin_corp_unicode/234_utf.txt'...
Corpus is now 4534967 characters long

Reading '../data/hin_corp_unicode/235_utf.txt'...
Corpus is now 4550081 characters long

Reading '../data/hin_corp_unicode/236_utf.txt'...
Corpus is now 4566495 characters long

Reading '../data/hin_corp_unicode/237_utf.txt'...
Corpus is now 4577398 characters long

Reading '../data/hin_corp_unicode/238_utf.txt'...
Corpus is now 4588329 characters long

Reading '../data/hin_corp_unicode/239_utf.txt'...
Corpus is now 4599895 characters long

Reading '../data/hin_corp_unicode/23_utf8.txt'...
Corpus is now 4613192 characters long

Reading '../data/hin_corp_unicode/240_utf.txt'...
Corpus is now 4624918 characters long

Reading '../data/hin_corp_unicode/241_utf.txt'...
Corpus is now 4636262 characters long

Reading '../data/hin_corp_unicode/242_utf.txt'...
Corpus is now 4653258 characters long

Reading '../data/hin_corp_unicode/243_utf.txt'...
Corpus is now 4664241 characters long

Reading '../data/hin_corp_unicode/244_utf.txt'...
Corpus is now 4676462 characters long

Reading '../data/hin_corp_unicode/245_utf.txt'...
Corpus is now 4687854 characters long

Reading '../data/hin_corp_unicode/246_utf.txt'...
Corpus is now 4700843 characters long

Reading '../data/hin_corp_unicode/247_utf.txt'...
Corpus is now 4713886 characters long

Reading '../data/hin_corp_unicode/248_utf.txt'...
Corpus is now 4727388 characters long

Reading '../data/hin_corp_unicode/249_utf.txt'...
Corpus is now 4742026 characters long

Reading '../data/hin_corp_unicode/24_utf8.txt'...
Corpus is now 4755433 characters long

Reading '../data/hin_corp_unicode/250_utf.txt'...
Corpus is now 4768480 characters long

Reading '../data/hin_corp_unicode/251_utf.txt'...
Corpus is now 4783503 characters long

Reading '../data/hin_corp_unicode/252_utf.txt'...
Corpus is now 4798170 characters long

Reading '../data/hin_corp_unicode/253_utf.txt'...
Corpus is now 4811340 characters long

Reading '../data/hin_corp_unicode/254_utf.txt'...
Corpus is now 4824161 characters long

Reading '../data/hin_corp_unicode/255_utf.txt'...
Corpus is now 4839034 characters long

Reading '../data/hin_corp_unicode/256_utf.txt'...
Corpus is now 4855757 characters long

Reading '../data/hin_corp_unicode/257_utf.txt'...
Corpus is now 4872701 characters long

Reading '../data/hin_corp_unicode/258_utf.txt'...
Corpus is now 4895079 characters long

Reading '../data/hin_corp_unicode/25_utf8.txt'...
Corpus is now 4906841 characters long

Reading '../data/hin_corp_unicode/260_utf.txt'...
Corpus is now 4920048 characters long

Reading '../data/hin_corp_unicode/261_utf.txt'...
Corpus is now 4934702 characters long

Reading '../data/hin_corp_unicode/262_utf.txt'...
Corpus is now 4949777 characters long

Reading '../data/hin_corp_unicode/263_utf.txt'...
Corpus is now 4964616 characters long

Reading '../data/hin_corp_unicode/264_utf.txt'...
Corpus is now 4978252 characters long

Reading '../data/hin_corp_unicode/265_utf.txt'...
Corpus is now 4993192 characters long

Reading '../data/hin_corp_unicode/266_utf.txt'...
Corpus is now 5006249 characters long

Reading '../data/hin_corp_unicode/267_utf.txt'...
Corpus is now 5019571 characters long

Reading '../data/hin_corp_unicode/268_utf.txt'...
Corpus is now 5035939 characters long

Reading '../data/hin_corp_unicode/269_utf.txt'...
Corpus is now 5049041 characters long

Reading '../data/hin_corp_unicode/26_utf8.txt'...
Corpus is now 5061318 characters long

Reading '../data/hin_corp_unicode/270_utf.txt'...
Corpus is now 5074241 characters long

Reading '../data/hin_corp_unicode/271_utf.txt'...
Corpus is now 5086540 characters long

Reading '../data/hin_corp_unicode/272_utf.txt'...
Corpus is now 5098601 characters long

Reading '../data/hin_corp_unicode/273_utf.txt'...
Corpus is now 5112467 characters long

Reading '../data/hin_corp_unicode/274_utf.txt'...
Corpus is now 5126436 characters long

Reading '../data/hin_corp_unicode/275_utf.txt'...
Corpus is now 5141559 characters long

Reading '../data/hin_corp_unicode/276_utf.txt'...
Corpus is now 5157986 characters long

Reading '../data/hin_corp_unicode/277_utf.txt'...
Corpus is now 5170056 characters long

Reading '../data/hin_corp_unicode/278_utf.txt'...
Corpus is now 5183801 characters long

Reading '../data/hin_corp_unicode/279_utf.txt'...
Corpus is now 5196903 characters long

Reading '../data/hin_corp_unicode/27_utf8.txt'...
Corpus is now 5209273 characters long

Reading '../data/hin_corp_unicode/280_utf.txt'...
Corpus is now 5223063 characters long

Reading '../data/hin_corp_unicode/281_utf.txt'...
Corpus is now 5237222 characters long

Reading '../data/hin_corp_unicode/282_utf.txt'...
Corpus is now 5250061 characters long

Reading '../data/hin_corp_unicode/283_utf.txt'...
Corpus is now 5265063 characters long

Reading '../data/hin_corp_unicode/284_utf.txt'...
Corpus is now 5276532 characters long

Reading '../data/hin_corp_unicode/285_utf.txt'...
Corpus is now 5288264 characters long

Reading '../data/hin_corp_unicode/286_utf.txt'...
Corpus is now 5303228 characters long

Reading '../data/hin_corp_unicode/287_utf.txt'...
Corpus is now 5315065 characters long

Reading '../data/hin_corp_unicode/288_utf.txt'...
Corpus is now 5327824 characters long

Reading '../data/hin_corp_unicode/289_utf.txt'...
Corpus is now 5342528 characters long

Reading '../data/hin_corp_unicode/28_utf8.txt'...
Corpus is now 5352332 characters long

Reading '../data/hin_corp_unicode/290_utf.txt'...
Corpus is now 5363925 characters long

Reading '../data/hin_corp_unicode/291_utf.txt'...
Corpus is now 5379707 characters long

Reading '../data/hin_corp_unicode/292_utf.txt'...
Corpus is now 5390393 characters long

Reading '../data/hin_corp_unicode/293_utf.txt'...
Corpus is now 5401551 characters long

Reading '../data/hin_corp_unicode/294_utf.txt'...
Corpus is now 5414925 characters long

Reading '../data/hin_corp_unicode/295_utf.txt'...
Corpus is now 5428209 characters long

Reading '../data/hin_corp_unicode/296_utf.txt'...
Corpus is now 5441870 characters long

Reading '../data/hin_corp_unicode/297_utf.txt'...
Corpus is now 5452633 characters long

Reading '../data/hin_corp_unicode/298_utf.txt'...
Corpus is now 5466687 characters long

Reading '../data/hin_corp_unicode/299_utf.txt'...
Corpus is now 5479385 characters long

Reading '../data/hin_corp_unicode/29_utf8.txt'...
Corpus is now 5493181 characters long

Reading '../data/hin_corp_unicode/2_utf8.txt'...
Corpus is now 5493181 characters long

Reading '../data/hin_corp_unicode/300_utf.txt'...
Corpus is now 5505683 characters long

Reading '../data/hin_corp_unicode/301_utf.txt'...
Corpus is now 5518052 characters long

Reading '../data/hin_corp_unicode/302_utf.txt'...
Corpus is now 5529675 characters long

Reading '../data/hin_corp_unicode/303_utf.txt'...
Corpus is now 5544052 characters long

Reading '../data/hin_corp_unicode/304_utf.txt'...
Corpus is now 5558153 characters long

Reading '../data/hin_corp_unicode/305_utf.txt'...
Corpus is now 5570521 characters long

Reading '../data/hin_corp_unicode/306_utf.txt'...
Corpus is now 5584865 characters long

Reading '../data/hin_corp_unicode/307_utf.txt'...
Corpus is now 5598842 characters long

Reading '../data/hin_corp_unicode/308_utf.txt'...
Corpus is now 5616143 characters long

Reading '../data/hin_corp_unicode/309_utf.txt'...
Corpus is now 5630296 characters long

Reading '../data/hin_corp_unicode/30_utf8.txt'...
Corpus is now 5643001 characters long

Reading '../data/hin_corp_unicode/310_utf.txt'...
Corpus is now 5656845 characters long

Reading '../data/hin_corp_unicode/311_utf.txt'...
Corpus is now 5670433 characters long

Reading '../data/hin_corp_unicode/312_utf.txt'...
Corpus is now 5682342 characters long

Reading '../data/hin_corp_unicode/313_utf.txt'...
Corpus is now 5697294 characters long

Reading '../data/hin_corp_unicode/314_utf.txt'...
Corpus is now 5708604 characters long

Reading '../data/hin_corp_unicode/315_utf.txt'...
Corpus is now 5720294 characters long

Reading '../data/hin_corp_unicode/316_utf.txt'...
Corpus is now 5732291 characters long

Reading '../data/hin_corp_unicode/317_utf.txt'...
Corpus is now 5747173 characters long

Reading '../data/hin_corp_unicode/318_utf.txt'...
Corpus is now 5767604 characters long

Reading '../data/hin_corp_unicode/319_utf.txt'...
Corpus is now 5782702 characters long

Reading '../data/hin_corp_unicode/31_utf8.txt'...
Corpus is now 5804959 characters long

Reading '../data/hin_corp_unicode/320_utf.txt'...
Corpus is now 5818954 characters long

Reading '../data/hin_corp_unicode/321_utf.txt'...
Corpus is now 5836543 characters long

Reading '../data/hin_corp_unicode/322_utf.txt'...
Corpus is now 5850598 characters long

Reading '../data/hin_corp_unicode/323_utf.txt'...
Corpus is now 5865063 characters long

Reading '../data/hin_corp_unicode/324_utf.txt'...
Corpus is now 5878976 characters long

Reading '../data/hin_corp_unicode/325_utf.txt'...
Corpus is now 5892089 characters long

Reading '../data/hin_corp_unicode/326_utf.txt'...
Corpus is now 5903870 characters long

Reading '../data/hin_corp_unicode/327_utf.txt'...
Corpus is now 5919368 characters long

Reading '../data/hin_corp_unicode/328_utf.txt'...
Corpus is now 5936108 characters long

Reading '../data/hin_corp_unicode/329_utf.txt'...
Corpus is now 5952521 characters long

Reading '../data/hin_corp_unicode/32_utf8.txt'...
Corpus is now 5962310 characters long

Reading '../data/hin_corp_unicode/330_utf.txt'...
Corpus is now 5976174 characters long

Reading '../data/hin_corp_unicode/331_utf.txt'...
Corpus is now 5992148 characters long

Reading '../data/hin_corp_unicode/332_utf.txt'...
Corpus is now 6006692 characters long

Reading '../data/hin_corp_unicode/333_utf.txt'...
Corpus is now 6021066 characters long

Reading '../data/hin_corp_unicode/334_utf.txt'...
Corpus is now 6037471 characters long

Reading '../data/hin_corp_unicode/335_utf.txt'...
Corpus is now 6053659 characters long

Reading '../data/hin_corp_unicode/336_utf.txt'...
Corpus is now 6071558 characters long

Reading '../data/hin_corp_unicode/337_utf.txt'...
Corpus is now 6088938 characters long

Reading '../data/hin_corp_unicode/338_utf.txt'...
Corpus is now 6108104 characters long

Reading '../data/hin_corp_unicode/339_utf.txt'...
Corpus is now 6118465 characters long

Reading '../data/hin_corp_unicode/33_utf8.txt'...
Corpus is now 6129532 characters long

Reading '../data/hin_corp_unicode/340_utf.txt'...
Corpus is now 6148981 characters long

Reading '../data/hin_corp_unicode/341_utf.txt'...
Corpus is now 6163494 characters long

Reading '../data/hin_corp_unicode/342_utf.txt'...
Corpus is now 6178613 characters long

Reading '../data/hin_corp_unicode/343_utf.txt'...
Corpus is now 6194274 characters long

Reading '../data/hin_corp_unicode/344_utf.txt'...
Corpus is now 6207504 characters long

Reading '../data/hin_corp_unicode/345_utf.txt'...
Corpus is now 6221186 characters long

Reading '../data/hin_corp_unicode/346_utf.txt'...
Corpus is now 6232709 characters long

Reading '../data/hin_corp_unicode/347_utf.txt'...
Corpus is now 6244051 characters long

Reading '../data/hin_corp_unicode/348_utf.txt'...
Corpus is now 6259045 characters long

Reading '../data/hin_corp_unicode/349_utf.txt'...
Corpus is now 6272195 characters long

Reading '../data/hin_corp_unicode/34_utf8.txt'...
Corpus is now 6282579 characters long

Reading '../data/hin_corp_unicode/350_utf.txt'...
Corpus is now 6295783 characters long

Reading '../data/hin_corp_unicode/351_utf.txt'...
Corpus is now 6309577 characters long

Reading '../data/hin_corp_unicode/352_utf.txt'...
Corpus is now 6321745 characters long

Reading '../data/hin_corp_unicode/353_utf.txt'...
Corpus is now 6335630 characters long

Reading '../data/hin_corp_unicode/354_utf.txt'...
Corpus is now 6350010 characters long

Reading '../data/hin_corp_unicode/355_utf.txt'...
Corpus is now 6362369 characters long

Reading '../data/hin_corp_unicode/356_utf.txt'...
Corpus is now 6375050 characters long

Reading '../data/hin_corp_unicode/357_utf.txt'...
Corpus is now 6386793 characters long

Reading '../data/hin_corp_unicode/358_utf.txt'...
Corpus is now 6399710 characters long

Reading '../data/hin_corp_unicode/35_utf8.txt'...
Corpus is now 6412740 characters long

Reading '../data/hin_corp_unicode/360_utf.txt'...
Corpus is now 6428857 characters long

Reading '../data/hin_corp_unicode/361_utf.txt'...
Corpus is now 6440934 characters long

Reading '../data/hin_corp_unicode/362_utf.txt'...
Corpus is now 6450700 characters long

Reading '../data/hin_corp_unicode/363_utf.txt'...
Corpus is now 6461605 characters long

Reading '../data/hin_corp_unicode/364_utf.txt'...
Corpus is now 6474160 characters long

Reading '../data/hin_corp_unicode/365_utf.txt'...
Corpus is now 6488238 characters long

Reading '../data/hin_corp_unicode/366_utf.txt'...
Corpus is now 6501033 characters long

Reading '../data/hin_corp_unicode/367_utf.txt'...
Corpus is now 6513610 characters long

Reading '../data/hin_corp_unicode/368_utf.txt'...
Corpus is now 6527693 characters long

Reading '../data/hin_corp_unicode/369_utf.txt'...
Corpus is now 6540292 characters long

Reading '../data/hin_corp_unicode/36_utf8.txt'...
Corpus is now 6551583 characters long

Reading '../data/hin_corp_unicode/370_utf.txt'...
Corpus is now 6564508 characters long

Reading '../data/hin_corp_unicode/371_utf.txt'...
Corpus is now 6577179 characters long

Reading '../data/hin_corp_unicode/372_utf.txt'...
Corpus is now 6591352 characters long

Reading '../data/hin_corp_unicode/373_utf.txt'...
Corpus is now 6606773 characters long

Reading '../data/hin_corp_unicode/374_utf.txt'...
Corpus is now 6624108 characters long

Reading '../data/hin_corp_unicode/375_utf.txt'...
Corpus is now 6639503 characters long

Reading '../data/hin_corp_unicode/376_utf.txt'...
Corpus is now 6651629 characters long

Reading '../data/hin_corp_unicode/377_utf.txt'...
Corpus is now 6664979 characters long

Reading '../data/hin_corp_unicode/378_utf.txt'...
Corpus is now 6680668 characters long

Reading '../data/hin_corp_unicode/379_utf.txt'...
Corpus is now 6693267 characters long

Reading '../data/hin_corp_unicode/37_utf8.txt'...
Corpus is now 6707011 characters long

Reading '../data/hin_corp_unicode/380_utf.txt'...
Corpus is now 6719627 characters long

Reading '../data/hin_corp_unicode/381_utf.txt'...
Corpus is now 6734723 characters long

Reading '../data/hin_corp_unicode/382_utf.txt'...
Corpus is now 6747607 characters long

Reading '../data/hin_corp_unicode/383_utf.txt'...
Corpus is now 6759683 characters long

Reading '../data/hin_corp_unicode/384_utf.txt'...
Corpus is now 6772928 characters long

Reading '../data/hin_corp_unicode/385_utf.txt'...
Corpus is now 6787050 characters long

Reading '../data/hin_corp_unicode/386_utf.txt'...
Corpus is now 6802286 characters long

Reading '../data/hin_corp_unicode/387_utf.txt'...
Corpus is now 6814185 characters long

Reading '../data/hin_corp_unicode/388_utf.txt'...
Corpus is now 6829350 characters long

Reading '../data/hin_corp_unicode/389_utf.txt'...
Corpus is now 6843262 characters long

Reading '../data/hin_corp_unicode/38_utf8.txt'...
Corpus is now 6855693 characters long

Reading '../data/hin_corp_unicode/390_utf.txt'...
Corpus is now 6868591 characters long

Reading '../data/hin_corp_unicode/391_utf.txt'...
Corpus is now 6882134 characters long

Reading '../data/hin_corp_unicode/392_utf.txt'...
Corpus is now 6898836 characters long

Reading '../data/hin_corp_unicode/393_utf.txt'...
Corpus is now 6912639 characters long

Reading '../data/hin_corp_unicode/394_utf.txt'...
Corpus is now 6924197 characters long

Reading '../data/hin_corp_unicode/395_utf.txt'...
Corpus is now 6936502 characters long

Reading '../data/hin_corp_unicode/396_utf.txt'...
Corpus is now 6949813 characters long

Reading '../data/hin_corp_unicode/397_utf.txt'...
Corpus is now 6960430 characters long

Reading '../data/hin_corp_unicode/398_utf.txt'...
Corpus is now 6975245 characters long

Reading '../data/hin_corp_unicode/399_utf.txt'...
Corpus is now 6988217 characters long

Reading '../data/hin_corp_unicode/39_utf8.txt'...
Corpus is now 6999459 characters long

Reading '../data/hin_corp_unicode/3_utf8.txt'...
Corpus is now 6999459 characters long

Reading '../data/hin_corp_unicode/400_utf.txt'...
Corpus is now 7013486 characters long

Reading '../data/hin_corp_unicode/401_utf.txt'...
Corpus is now 7025134 characters long

Reading '../data/hin_corp_unicode/402_utf.txt'...
Corpus is now 7039469 characters long

Reading '../data/hin_corp_unicode/403_utf.txt'...
Corpus is now 7054034 characters long

Reading '../data/hin_corp_unicode/404_utf.txt'...
Corpus is now 7068362 characters long

Reading '../data/hin_corp_unicode/405_utf.txt'...
Corpus is now 7082339 characters long

Reading '../data/hin_corp_unicode/406_utf.txt'...
Corpus is now 7099268 characters long

Reading '../data/hin_corp_unicode/407_utf.txt'...
Corpus is now 7111876 characters long

Reading '../data/hin_corp_unicode/408_utf.txt'...
Corpus is now 7124652 characters long

Reading '../data/hin_corp_unicode/409_utf.txt'...
Corpus is now 7136719 characters long

Reading '../data/hin_corp_unicode/40_utf8.txt'...
Corpus is now 7146638 characters long

Reading '../data/hin_corp_unicode/410_utf.txt'...
Corpus is now 7163273 characters long

Reading '../data/hin_corp_unicode/411_utf.txt'...
Corpus is now 7176509 characters long

Reading '../data/hin_corp_unicode/412_utf.txt'...
Corpus is now 7188767 characters long

Reading '../data/hin_corp_unicode/413_utf.txt'...
Corpus is now 7201834 characters long

Reading '../data/hin_corp_unicode/414_utf.txt'...
Corpus is now 7217561 characters long

Reading '../data/hin_corp_unicode/415_utf.txt'...
Corpus is now 7232791 characters long

Reading '../data/hin_corp_unicode/416_utf.txt'...
Corpus is now 7245283 characters long

Reading '../data/hin_corp_unicode/417_utf.txt'...
Corpus is now 7266530 characters long

Reading '../data/hin_corp_unicode/418_utf.txt'...
Corpus is now 7283588 characters long

Reading '../data/hin_corp_unicode/419_utf.txt'...
Corpus is now 7298947 characters long

Reading '../data/hin_corp_unicode/420_utf.txt'...
Corpus is now 7315252 characters long

Reading '../data/hin_corp_unicode/421_utf.txt'...
Corpus is now 7330352 characters long

Reading '../data/hin_corp_unicode/422_utf.txt'...
Corpus is now 7343230 characters long

Reading '../data/hin_corp_unicode/423_utf.txt'...
Corpus is now 7357580 characters long

Reading '../data/hin_corp_unicode/424_utf.txt'...
Corpus is now 7371572 characters long

Reading '../data/hin_corp_unicode/425_utf.txt'...
Corpus is now 7387240 characters long

Reading '../data/hin_corp_unicode/426_utf.txt'...
Corpus is now 7400374 characters long

Reading '../data/hin_corp_unicode/427_utf.txt'...
Corpus is now 7414026 characters long

Reading '../data/hin_corp_unicode/428_utf.txt'...
Corpus is now 7428639 characters long

Reading '../data/hin_corp_unicode/429_utf.txt'...
Corpus is now 7442850 characters long

Reading '../data/hin_corp_unicode/430_utf.txt'...
Corpus is now 7457952 characters long

Reading '../data/hin_corp_unicode/431_utf.txt'...
Corpus is now 7473105 characters long

Reading '../data/hin_corp_unicode/432_utf.txt'...
Corpus is now 7488689 characters long

Reading '../data/hin_corp_unicode/433_utf.txt'...
Corpus is now 7503672 characters long

Reading '../data/hin_corp_unicode/434_utf.txt'...
Corpus is now 7517701 characters long

Reading '../data/hin_corp_unicode/435_utf.txt'...
Corpus is now 7531879 characters long

Reading '../data/hin_corp_unicode/436_utf.txt'...
Corpus is now 7547692 characters long

Reading '../data/hin_corp_unicode/437_utf.txt'...
Corpus is now 7562106 characters long

Reading '../data/hin_corp_unicode/438_utf.txt'...
Corpus is now 7575694 characters long

Reading '../data/hin_corp_unicode/439_utf.txt'...
Corpus is now 7594305 characters long

Reading '../data/hin_corp_unicode/440_utf.txt'...
Corpus is now 7610944 characters long

Reading '../data/hin_corp_unicode/441_utf.txt'...
Corpus is now 7627367 characters long

Reading '../data/hin_corp_unicode/442_utf.txt'...
Corpus is now 7645649 characters long

Reading '../data/hin_corp_unicode/443_utf.txt'...
Corpus is now 7662992 characters long

Reading '../data/hin_corp_unicode/444_utf.txt'...
Corpus is now 7682322 characters long

Reading '../data/hin_corp_unicode/445_utf.txt'...
Corpus is now 7697636 characters long

Reading '../data/hin_corp_unicode/446_utf.txt'...
Corpus is now 7711158 characters long

Reading '../data/hin_corp_unicode/447_utf.txt'...
Corpus is now 7724052 characters long

Reading '../data/hin_corp_unicode/448_utf.txt'...
Corpus is now 7737567 characters long

Reading '../data/hin_corp_unicode/449_utf.txt'...
Corpus is now 7749320 characters long

Reading '../data/hin_corp_unicode/450_utf.txt'...
Corpus is now 7763551 characters long

Reading '../data/hin_corp_unicode/451_utf.txt'...
Corpus is now 7777173 characters long

Reading '../data/hin_corp_unicode/452_utf.txt'...
Corpus is now 7790221 characters long

Reading '../data/hin_corp_unicode/453_utf.txt'...
Corpus is now 7803608 characters long

Reading '../data/hin_corp_unicode/454_utf.txt'...
Corpus is now 7815934 characters long

Reading '../data/hin_corp_unicode/455_utf.txt'...
Corpus is now 7829445 characters long

Reading '../data/hin_corp_unicode/456_utf.txt'...
Corpus is now 7840741 characters long

Reading '../data/hin_corp_unicode/457_utf.txt'...
Corpus is now 7851962 characters long

Reading '../data/hin_corp_unicode/458_utf.txt'...
Corpus is now 7869995 characters long

Reading '../data/hin_corp_unicode/460_utf.txt'...
Corpus is now 7887448 characters long

Reading '../data/hin_corp_unicode/461_utf.txt'...
Corpus is now 7903312 characters long

Reading '../data/hin_corp_unicode/462_utf.txt'...
Corpus is now 7917839 characters long

Reading '../data/hin_corp_unicode/463_utf.txt'...
Corpus is now 7934287 characters long

Reading '../data/hin_corp_unicode/464_utf.txt'...
Corpus is now 7948455 characters long

Reading '../data/hin_corp_unicode/465_utf.txt'...
Corpus is now 7961474 characters long

Reading '../data/hin_corp_unicode/466_utf.txt'...
Corpus is now 7975712 characters long

Reading '../data/hin_corp_unicode/467_utf.txt'...
Corpus is now 7993439 characters long

Reading '../data/hin_corp_unicode/468_utf.txt'...
Corpus is now 8018423 characters long

Reading '../data/hin_corp_unicode/469_utf.txt'...
Corpus is now 8031944 characters long

Reading '../data/hin_corp_unicode/470_utf.txt'...
Corpus is now 8046851 characters long

Reading '../data/hin_corp_unicode/471_utf.txt'...
Corpus is now 8059473 characters long

Reading '../data/hin_corp_unicode/472_utf.txt'...
Corpus is now 8074922 characters long

Reading '../data/hin_corp_unicode/473_utf.txt'...
Corpus is now 8092468 characters long

Reading '../data/hin_corp_unicode/474_utf.txt'...
Corpus is now 8107816 characters long

Reading '../data/hin_corp_unicode/475_utf.txt'...
Corpus is now 8123542 characters long

Reading '../data/hin_corp_unicode/476_utf.txt'...
Corpus is now 8139389 characters long

Reading '../data/hin_corp_unicode/477_utf.txt'...
Corpus is now 8153068 characters long

Reading '../data/hin_corp_unicode/478_utf.txt'...
Corpus is now 8169661 characters long

Reading '../data/hin_corp_unicode/479_utf.txt'...
Corpus is now 8183182 characters long

Reading '../data/hin_corp_unicode/480_utf.txt'...
Corpus is now 8196859 characters long

Reading '../data/hin_corp_unicode/481_utf.txt'...
Corpus is now 8210888 characters long

Reading '../data/hin_corp_unicode/482_utf.txt'...
Corpus is now 8227394 characters long

Reading '../data/hin_corp_unicode/483_utf.txt'...
Corpus is now 8241604 characters long

Reading '../data/hin_corp_unicode/484_utf.txt'...
Corpus is now 8252810 characters long

Reading '../data/hin_corp_unicode/485_utf.txt'...
Corpus is now 8266078 characters long

Reading '../data/hin_corp_unicode/486_utf.txt'...
Corpus is now 8278876 characters long

Reading '../data/hin_corp_unicode/487_utf.txt'...
Corpus is now 8293353 characters long

Reading '../data/hin_corp_unicode/488_utf.txt'...
Corpus is now 8305262 characters long

Reading '../data/hin_corp_unicode/489_utf.txt'...
Corpus is now 8318755 characters long

Reading '../data/hin_corp_unicode/490_utf.txt'...
Corpus is now 8331629 characters long

Reading '../data/hin_corp_unicode/491_utf.txt'...
Corpus is now 8345192 characters long

Reading '../data/hin_corp_unicode/492_utf.txt'...
Corpus is now 8358961 characters long

Reading '../data/hin_corp_unicode/493_utf.txt'...
Corpus is now 8372297 characters long

Reading '../data/hin_corp_unicode/494_utf.txt'...
Corpus is now 8385846 characters long

Reading '../data/hin_corp_unicode/495_utf.txt'...
Corpus is now 8402341 characters long

Reading '../data/hin_corp_unicode/496_utf.txt'...
Corpus is now 8415750 characters long

Reading '../data/hin_corp_unicode/497_utf.txt'...
Corpus is now 8429965 characters long

Reading '../data/hin_corp_unicode/498_utf.txt'...
Corpus is now 8441988 characters long

Reading '../data/hin_corp_unicode/499_utf.txt'...
Corpus is now 8456165 characters long

Reading '../data/hin_corp_unicode/4_utf8.txt'...
Corpus is now 8456165 characters long

Reading '../data/hin_corp_unicode/500_utf.txt'...
Corpus is now 8469805 characters long

Reading '../data/hin_corp_unicode/501_utf.txt'...
Corpus is now 8484940 characters long

Reading '../data/hin_corp_unicode/502_utf.txt'...
Corpus is now 8497451 characters long

Reading '../data/hin_corp_unicode/503_utf.txt'...
Corpus is now 8511323 characters long

Reading '../data/hin_corp_unicode/504_utf.txt'...
Corpus is now 8526302 characters long

Reading '../data/hin_corp_unicode/505_utf.txt'...
Corpus is now 8540775 characters long

Reading '../data/hin_corp_unicode/506_utf.txt'...
Corpus is now 8556631 characters long

Reading '../data/hin_corp_unicode/507_utf.txt'...
Corpus is now 8571597 characters long

Reading '../data/hin_corp_unicode/508_utf.txt'...
Corpus is now 8585683 characters long

Reading '../data/hin_corp_unicode/509_utf.txt'...
Corpus is now 8598998 characters long

Reading '../data/hin_corp_unicode/510_utf.txt'...
Corpus is now 8612528 characters long

Reading '../data/hin_corp_unicode/511_utf.txt'...
Corpus is now 8626003 characters long

Reading '../data/hin_corp_unicode/512_utf.txt'...
Corpus is now 8639426 characters long

Reading '../data/hin_corp_unicode/513_utf.txt'...
Corpus is now 8653871 characters long

Reading '../data/hin_corp_unicode/514_utf.txt'...
Corpus is now 8667520 characters long

Reading '../data/hin_corp_unicode/515_utf.txt'...
Corpus is now 8681443 characters long

Reading '../data/hin_corp_unicode/516_utf.txt'...
Corpus is now 8698510 characters long

Reading '../data/hin_corp_unicode/517_utf.txt'...
Corpus is now 8711713 characters long

Reading '../data/hin_corp_unicode/518_utf.txt'...
Corpus is now 8727806 characters long

Reading '../data/hin_corp_unicode/519_utf.txt'...
Corpus is now 8741742 characters long

Reading '../data/hin_corp_unicode/520_utf.txt'...
Corpus is now 8755491 characters long

Reading '../data/hin_corp_unicode/521_utf.txt'...
Corpus is now 8767130 characters long

Reading '../data/hin_corp_unicode/522_utf.txt'...
Corpus is now 8780326 characters long

Reading '../data/hin_corp_unicode/523_utf.txt'...
Corpus is now 8793522 characters long

Reading '../data/hin_corp_unicode/524_utf.txt'...
Corpus is now 8806431 characters long

Reading '../data/hin_corp_unicode/525_utf.txt'...
Corpus is now 8820364 characters long

Reading '../data/hin_corp_unicode/526_utf.txt'...
Corpus is now 8834507 characters long

Reading '../data/hin_corp_unicode/527_utf.txt'...
Corpus is now 8846627 characters long

Reading '../data/hin_corp_unicode/528_utf.txt'...
Corpus is now 8858764 characters long

Reading '../data/hin_corp_unicode/529_utf.txt'...
Corpus is now 8873314 characters long

Reading '../data/hin_corp_unicode/530_utf.txt'...
Corpus is now 8890897 characters long

Reading '../data/hin_corp_unicode/531_utf.txt'...
Corpus is now 8905544 characters long

Reading '../data/hin_corp_unicode/532_utf.txt'...
Corpus is now 8916272 characters long

Reading '../data/hin_corp_unicode/533_utf.txt'...
Corpus is now 8929231 characters long

Reading '../data/hin_corp_unicode/534_utf.txt'...
Corpus is now 8943688 characters long

Reading '../data/hin_corp_unicode/535_utf.txt'...
Corpus is now 8955648 characters long

Reading '../data/hin_corp_unicode/536_utf.txt'...
Corpus is now 8971843 characters long

Reading '../data/hin_corp_unicode/537_utf.txt'...
Corpus is now 8983601 characters long

Reading '../data/hin_corp_unicode/538_utf.txt'...
Corpus is now 8997258 characters long

Reading '../data/hin_corp_unicode/539_utf.txt'...
Corpus is now 9006362 characters long

Reading '../data/hin_corp_unicode/540_utf.txt'...
Corpus is now 9017498 characters long

Reading '../data/hin_corp_unicode/541_utf.txt'...
Corpus is now 9026958 characters long

Reading '../data/hin_corp_unicode/542_utf.txt'...
Corpus is now 9037852 characters long

Reading '../data/hin_corp_unicode/543_utf.txt'...
Corpus is now 9049878 characters long

Reading '../data/hin_corp_unicode/544_utf.txt'...
Corpus is now 9064526 characters long

Reading '../data/hin_corp_unicode/545_utf.txt'...
Corpus is now 9076532 characters long

Reading '../data/hin_corp_unicode/546_utf.txt'...
Corpus is now 9086723 characters long

Reading '../data/hin_corp_unicode/547_utf.txt'...
Corpus is now 9098189 characters long

Reading '../data/hin_corp_unicode/548_utf.txt'...
Corpus is now 9107374 characters long

Reading '../data/hin_corp_unicode/549_utf.txt'...
Corpus is now 9118824 characters long

Reading '../data/hin_corp_unicode/550_utf.txt'...
Corpus is now 9131548 characters long

Reading '../data/hin_corp_unicode/551_utf.txt'...
Corpus is now 9143077 characters long

Reading '../data/hin_corp_unicode/552_utf.txt'...
Corpus is now 9154784 characters long

Reading '../data/hin_corp_unicode/553_utf.txt'...
Corpus is now 9169398 characters long

Reading '../data/hin_corp_unicode/554_utf.txt'...
Corpus is now 9180708 characters long

Reading '../data/hin_corp_unicode/555_utf.txt'...
Corpus is now 9195767 characters long

Reading '../data/hin_corp_unicode/556_utf.txt'...
Corpus is now 9206663 characters long

Reading '../data/hin_corp_unicode/557_utf.txt'...
Corpus is now 9219280 characters long

Reading '../data/hin_corp_unicode/558_utf.txt'...
Corpus is now 9229725 characters long

Reading '../data/hin_corp_unicode/560_utf.txt'...
Corpus is now 9241615 characters long

Reading '../data/hin_corp_unicode/561_utf.txt'...
Corpus is now 9254810 characters long

Reading '../data/hin_corp_unicode/562_utf.txt'...
Corpus is now 9265063 characters long

Reading '../data/hin_corp_unicode/563_utf.txt'...
Corpus is now 9274364 characters long

Reading '../data/hin_corp_unicode/564_utf.txt'...
Corpus is now 9285645 characters long

Reading '../data/hin_corp_unicode/565_utf.txt'...
Corpus is now 9295374 characters long

Reading '../data/hin_corp_unicode/566_utf.txt'...
Corpus is now 9306851 characters long

Reading '../data/hin_corp_unicode/567_utf.txt'...
Corpus is now 9316678 characters long

Reading '../data/hin_corp_unicode/568_utf.txt'...
Corpus is now 9327243 characters long

Reading '../data/hin_corp_unicode/569_utf.txt'...
Corpus is now 9338184 characters long

Reading '../data/hin_corp_unicode/570_utf.txt'...
Corpus is now 9351544 characters long

Reading '../data/hin_corp_unicode/571_utf.txt'...
Corpus is now 9365401 characters long

Reading '../data/hin_corp_unicode/572_utf.txt'...
Corpus is now 9374520 characters long

Reading '../data/hin_corp_unicode/573_utf.txt'...
Corpus is now 9386805 characters long

Reading '../data/hin_corp_unicode/574_utf.txt'...
Corpus is now 9395241 characters long

Reading '../data/hin_corp_unicode/575_utf.txt'...
Corpus is now 9400093 characters long

Reading '../data/hin_corp_unicode/576_utf.txt'...
Corpus is now 9407934 characters long

Reading '../data/hin_corp_unicode/577_utf.txt'...
Corpus is now 9418427 characters long

Reading '../data/hin_corp_unicode/578_utf.txt'...
Corpus is now 9430231 characters long

Reading '../data/hin_corp_unicode/579_utf.txt'...
Corpus is now 9441172 characters long

Reading '../data/hin_corp_unicode/580_utf.txt'...
Corpus is now 9453389 characters long

Reading '../data/hin_corp_unicode/581_utf.txt'...
Corpus is now 9464262 characters long

Reading '../data/hin_corp_unicode/582_utf.txt'...
Corpus is now 9473832 characters long

Reading '../data/hin_corp_unicode/583_utf.txt'...
Corpus is now 9485490 characters long

Reading '../data/hin_corp_unicode/584_utf.txt'...
Corpus is now 9498187 characters long

Reading '../data/hin_corp_unicode/585_utf.txt'...
Corpus is now 9509190 characters long

Reading '../data/hin_corp_unicode/586_utf.txt'...
Corpus is now 9519688 characters long

Reading '../data/hin_corp_unicode/587_utf.txt'...
Corpus is now 9530460 characters long

Reading '../data/hin_corp_unicode/588_utf.txt'...
Corpus is now 9543764 characters long

Reading '../data/hin_corp_unicode/589_utf.txt'...
Corpus is now 9554299 characters long

Reading '../data/hin_corp_unicode/590_utf.txt'...
Corpus is now 9565089 characters long

Reading '../data/hin_corp_unicode/591_utf.txt'...
Corpus is now 9579483 characters long

Reading '../data/hin_corp_unicode/592_utf.txt'...
Corpus is now 9592761 characters long

Reading '../data/hin_corp_unicode/593_utf.txt'...
Corpus is now 9601279 characters long

Reading '../data/hin_corp_unicode/594_utf.txt'...
Corpus is now 9611270 characters long

Reading '../data/hin_corp_unicode/595_utf.txt'...
Corpus is now 9622449 characters long

Reading '../data/hin_corp_unicode/596_utf.txt'...
Corpus is now 9636803 characters long

Reading '../data/hin_corp_unicode/597_utf.txt'...
Corpus is now 9649834 characters long

Reading '../data/hin_corp_unicode/598_utf.txt'...
Corpus is now 9661818 characters long

Reading '../data/hin_corp_unicode/599_utf.txt'...
Corpus is now 9669882 characters long

Reading '../data/hin_corp_unicode/5_utf8.txt'...
Corpus is now 9669882 characters long

Reading '../data/hin_corp_unicode/600_utf.txt'...
Corpus is now 9678989 characters long

Reading '../data/hin_corp_unicode/601_utf.txt'...
Corpus is now 9693980 characters long

Reading '../data/hin_corp_unicode/602_utf.txt'...
Corpus is now 9703015 characters long

Reading '../data/hin_corp_unicode/603_utf.txt'...
Corpus is now 9714949 characters long

Reading '../data/hin_corp_unicode/604_utf.txt'...
Corpus is now 9724043 characters long

Reading '../data/hin_corp_unicode/605_utf.txt'...
Corpus is now 9733824 characters long

Reading '../data/hin_corp_unicode/606_utf.txt'...
Corpus is now 9741902 characters long

Reading '../data/hin_corp_unicode/607_utf.txt'...
Corpus is now 9754762 characters long

Reading '../data/hin_corp_unicode/608_utf.txt'...
Corpus is now 9764558 characters long

Reading '../data/hin_corp_unicode/609_utf.txt'...
Corpus is now 9776955 characters long

Reading '../data/hin_corp_unicode/610_utf.txt'...
Corpus is now 9790713 characters long

Reading '../data/hin_corp_unicode/611_utf.txt'...
Corpus is now 9803202 characters long

Reading '../data/hin_corp_unicode/612_utf.txt'...
Corpus is now 9812263 characters long

Reading '../data/hin_corp_unicode/613_utf.txt'...
Corpus is now 9822935 characters long

Reading '../data/hin_corp_unicode/614_utf.txt'...
Corpus is now 9834136 characters long

Reading '../data/hin_corp_unicode/615_utf.txt'...
Corpus is now 9846079 characters long

Reading '../data/hin_corp_unicode/616_utf.txt'...
Corpus is now 9858447 characters long

Reading '../data/hin_corp_unicode/617_utf.txt'...
Corpus is now 9869061 characters long

Reading '../data/hin_corp_unicode/618_utf.txt'...
Corpus is now 9884364 characters long

Reading '../data/hin_corp_unicode/619_utf.txt'...
Corpus is now 9896932 characters long

Reading '../data/hin_corp_unicode/620_utf.txt'...
Corpus is now 9907821 characters long

Reading '../data/hin_corp_unicode/621_utf.txt'...
Corpus is now 9918185 characters long

Reading '../data/hin_corp_unicode/622_utf.txt'...
Corpus is now 9933619 characters long

Reading '../data/hin_corp_unicode/623_utf.txt'...
Corpus is now 9945753 characters long

Reading '../data/hin_corp_unicode/624_utf.txt'...
Corpus is now 9953667 characters long

Reading '../data/hin_corp_unicode/625_utf.txt'...
Corpus is now 9963913 characters long

Reading '../data/hin_corp_unicode/626_utf.txt'...
Corpus is now 9975284 characters long

Reading '../data/hin_corp_unicode/627_utf.txt'...
Corpus is now 9986741 characters long

Reading '../data/hin_corp_unicode/628_utf.txt'...
Corpus is now 9998917 characters long

Reading '../data/hin_corp_unicode/629_utf.txt'...
Corpus is now 10011638 characters long

Reading '../data/hin_corp_unicode/630_utf.txt'...
Corpus is now 10024681 characters long

Reading '../data/hin_corp_unicode/631_utf.txt'...
Corpus is now 10036216 characters long

Reading '../data/hin_corp_unicode/632_utf.txt'...
Corpus is now 10046779 characters long

Reading '../data/hin_corp_unicode/633_utf.txt'...
Corpus is now 10061616 characters long

Reading '../data/hin_corp_unicode/634_utf.txt'...
Corpus is now 10071378 characters long

Reading '../data/hin_corp_unicode/635_utf.txt'...
Corpus is now 10080607 characters long

Reading '../data/hin_corp_unicode/636_utf.txt'...
Corpus is now 10092553 characters long

Reading '../data/hin_corp_unicode/637_utf.txt'...
Corpus is now 10101782 characters long

Reading '../data/hin_corp_unicode/638_utf.txt'...
Corpus is now 10113728 characters long

Reading '../data/hin_corp_unicode/639_utf.txt'...
Corpus is now 10122437 characters long

Reading '../data/hin_corp_unicode/640_utf.txt'...
Corpus is now 10134201 characters long

Reading '../data/hin_corp_unicode/641_utf.txt'...
Corpus is now 10142806 characters long

Reading '../data/hin_corp_unicode/642_utf.txt'...
Corpus is now 10153390 characters long

Reading '../data/hin_corp_unicode/643_utf.txt'...
Corpus is now 10165368 characters long

Reading '../data/hin_corp_unicode/644_utf.txt'...
Corpus is now 10176006 characters long

Reading '../data/hin_corp_unicode/645_utf.txt'...
Corpus is now 10185999 characters long

Reading '../data/hin_corp_unicode/646_utf.txt'...
Corpus is now 10196005 characters long

Reading '../data/hin_corp_unicode/647_utf.txt'...
Corpus is now 10208225 characters long

Reading '../data/hin_corp_unicode/648_utf.txt'...
Corpus is now 10220254 characters long

Reading '../data/hin_corp_unicode/649_utf.txt'...
Corpus is now 10231185 characters long

Reading '../data/hin_corp_unicode/650_utf.txt'...
Corpus is now 10243288 characters long

Reading '../data/hin_corp_unicode/651_utf.txt'...
Corpus is now 10255136 characters long

Reading '../data/hin_corp_unicode/652_utf.txt'...
Corpus is now 10267009 characters long

Reading '../data/hin_corp_unicode/653_utf.txt'...
Corpus is now 10277074 characters long

Reading '../data/hin_corp_unicode/654_utf.txt'...
Corpus is now 10287102 characters long

Reading '../data/hin_corp_unicode/655_utf.txt'...
Corpus is now 10298868 characters long

Reading '../data/hin_corp_unicode/656_utf.txt'...
Corpus is now 10310945 characters long

Reading '../data/hin_corp_unicode/657_utf.txt'...
Corpus is now 10322230 characters long

Reading '../data/hin_corp_unicode/658_utf.txt'...
Corpus is now 10331965 characters long

Reading '../data/hin_corp_unicode/660_utf.txt'...
Corpus is now 10340741 characters long

Reading '../data/hin_corp_unicode/661_utf.txt'...
Corpus is now 10350321 characters long

Reading '../data/hin_corp_unicode/662_utf.txt'...
Corpus is now 10359306 characters long

Reading '../data/hin_corp_unicode/663_utf.txt'...
Corpus is now 10370501 characters long

Reading '../data/hin_corp_unicode/664_utf.txt'...
Corpus is now 10380708 characters long

Reading '../data/hin_corp_unicode/665_utf.txt'...
Corpus is now 10392480 characters long

Reading '../data/hin_corp_unicode/666_utf.txt'...
Corpus is now 10404267 characters long

Reading '../data/hin_corp_unicode/667_utf.txt'...
Corpus is now 10412494 characters long

Reading '../data/hin_corp_unicode/668_utf.txt'...
Corpus is now 10421900 characters long

Reading '../data/hin_corp_unicode/669_utf.txt'...
Corpus is now 10434585 characters long

Reading '../data/hin_corp_unicode/670_utf.txt'...
Corpus is now 10446139 characters long

Reading '../data/hin_corp_unicode/671_utf.txt'...
Corpus is now 10456215 characters long

Reading '../data/hin_corp_unicode/672_utf.txt'...
Corpus is now 10467899 characters long

Reading '../data/hin_corp_unicode/673_utf.txt'...
Corpus is now 10479130 characters long

Reading '../data/hin_corp_unicode/674_utf.txt'...
Corpus is now 10490598 characters long

Reading '../data/hin_corp_unicode/675_utf.txt'...
Corpus is now 10500049 characters long

Reading '../data/hin_corp_unicode/676_utf.txt'...
Corpus is now 10512812 characters long

Reading '../data/hin_corp_unicode/677_utf.txt'...
Corpus is now 10523419 characters long

Reading '../data/hin_corp_unicode/678_utf.txt'...
Corpus is now 10534874 characters long

Reading '../data/hin_corp_unicode/679_utf.txt'...
Corpus is now 10547559 characters long

Reading '../data/hin_corp_unicode/680_utf.txt'...
Corpus is now 10555297 characters long

Reading '../data/hin_corp_unicode/681_utf.txt'...
Corpus is now 10563801 characters long

Reading '../data/hin_corp_unicode/682_utf.txt'...
Corpus is now 10572900 characters long

Reading '../data/hin_corp_unicode/683_utf.txt'...
Corpus is now 10587579 characters long

Reading '../data/hin_corp_unicode/684_utf.txt'...
Corpus is now 10602284 characters long

Reading '../data/hin_corp_unicode/685_utf.txt'...
Corpus is now 10613147 characters long

Reading '../data/hin_corp_unicode/686_utf.txt'...
Corpus is now 10624427 characters long

Reading '../data/hin_corp_unicode/687_utf.txt'...
Corpus is now 10636115 characters long

Reading '../data/hin_corp_unicode/688_utf.txt'...
Corpus is now 10644562 characters long

Reading '../data/hin_corp_unicode/689_utf.txt'...
Corpus is now 10654681 characters long

Reading '../data/hin_corp_unicode/690_utf.txt'...
Corpus is now 10664241 characters long

Reading '../data/hin_corp_unicode/691_utf.txt'...
Corpus is now 10673595 characters long

Reading '../data/hin_corp_unicode/692_utf.txt'...
Corpus is now 10683870 characters long

Reading '../data/hin_corp_unicode/693_utf.txt'...
Corpus is now 10693841 characters long

Reading '../data/hin_corp_unicode/694_utf.txt'...
Corpus is now 10703998 characters long

Reading '../data/hin_corp_unicode/695_utf.txt'...
Corpus is now 10714222 characters long

Reading '../data/hin_corp_unicode/696_utf.txt'...
Corpus is now 10725473 characters long

Reading '../data/hin_corp_unicode/697_utf.txt'...
Corpus is now 10735806 characters long

Reading '../data/hin_corp_unicode/698_utf.txt'...
Corpus is now 10745474 characters long

Reading '../data/hin_corp_unicode/699_utf.txt'...
Corpus is now 10755453 characters long

Reading '../data/hin_corp_unicode/6_utf8.txt'...
Corpus is now 10755453 characters long

Reading '../data/hin_corp_unicode/700_utf.txt'...
Corpus is now 10767292 characters long

Reading '../data/hin_corp_unicode/701_utf.txt'...
Corpus is now 10776689 characters long

Reading '../data/hin_corp_unicode/702_utf.txt'...
Corpus is now 10784173 characters long

Reading '../data/hin_corp_unicode/703_utf.txt'...
Corpus is now 10794967 characters long

Reading '../data/hin_corp_unicode/704_utf.txt'...
Corpus is now 10805778 characters long

Reading '../data/hin_corp_unicode/705_utf.txt'...
Corpus is now 10815040 characters long

Reading '../data/hin_corp_unicode/706_utf.txt'...
Corpus is now 10833931 characters long

Reading '../data/hin_corp_unicode/707_utf.txt'...
Corpus is now 10840264 characters long

Reading '../data/hin_corp_unicode/708_utf.txt'...
Corpus is now 10852818 characters long

Reading '../data/hin_corp_unicode/709_utf.txt'...
Corpus is now 10864271 characters long

Reading '../data/hin_corp_unicode/70_utf.txt'...
Corpus is now 10871041 characters long

Reading '../data/hin_corp_unicode/710_utf.txt'...
Corpus is now 10878338 characters long

Reading '../data/hin_corp_unicode/711_utf.txt'...
Corpus is now 10888316 characters long

Reading '../data/hin_corp_unicode/712_utf.txt'...
Corpus is now 10896764 characters long

Reading '../data/hin_corp_unicode/713_utf.txt'...
Corpus is now 10910132 characters long

Reading '../data/hin_corp_unicode/714_utf.txt'...
Corpus is now 10915221 characters long

Reading '../data/hin_corp_unicode/715_utf.txt'...
Corpus is now 10926061 characters long

Reading '../data/hin_corp_unicode/716_utf.txt'...
Corpus is now 10940265 characters long

Reading '../data/hin_corp_unicode/717_utf.txt'...
Corpus is now 10950718 characters long

Reading '../data/hin_corp_unicode/718_utf.txt'...
Corpus is now 10963165 characters long

Reading '../data/hin_corp_unicode/719_utf.txt'...
Corpus is now 10977428 characters long

Reading '../data/hin_corp_unicode/71_utf.txt'...
Corpus is now 10988275 characters long

Reading '../data/hin_corp_unicode/720_utf.txt'...
Corpus is now 10995346 characters long

Reading '../data/hin_corp_unicode/721_utf.txt'...
Corpus is now 11005123 characters long

Reading '../data/hin_corp_unicode/722_utf.txt'...
Corpus is now 11015983 characters long

Reading '../data/hin_corp_unicode/723_utf.txt'...
Corpus is now 11026302 characters long

Reading '../data/hin_corp_unicode/724_utf.txt'...
Corpus is now 11036455 characters long

Reading '../data/hin_corp_unicode/725_utf.txt'...
Corpus is now 11046190 characters long

Reading '../data/hin_corp_unicode/726_utf.txt'...
Corpus is now 11055884 characters long

Reading '../data/hin_corp_unicode/727_utf.txt'...
Corpus is now 11066846 characters long

Reading '../data/hin_corp_unicode/728_utf.txt'...
Corpus is now 11077324 characters long

Reading '../data/hin_corp_unicode/729_utf.txt'...
Corpus is now 11086270 characters long

Reading '../data/hin_corp_unicode/72_utf.txt'...
Corpus is now 11098023 characters long

Reading '../data/hin_corp_unicode/730_utf.txt'...
Corpus is now 11108387 characters long

Reading '../data/hin_corp_unicode/731_utf.txt'...
Corpus is now 11118982 characters long

Reading '../data/hin_corp_unicode/732_utf.txt'...
Corpus is now 11130378 characters long

Reading '../data/hin_corp_unicode/733_utf.txt'...
Corpus is now 11139469 characters long

Reading '../data/hin_corp_unicode/734_utf.txt'...
Corpus is now 11148044 characters long

Reading '../data/hin_corp_unicode/735_utf.txt'...
Corpus is now 11158764 characters long

Reading '../data/hin_corp_unicode/736_utf.txt'...
Corpus is now 11166300 characters long

Reading '../data/hin_corp_unicode/737_utf.txt'...
Corpus is now 11174875 characters long

Reading '../data/hin_corp_unicode/738_utf.txt'...
Corpus is now 11185595 characters long

Reading '../data/hin_corp_unicode/739_utf.txt'...
Corpus is now 11193131 characters long

Reading '../data/hin_corp_unicode/73_utf.txt'...
Corpus is now 11203714 characters long

Reading '../data/hin_corp_unicode/740_utf.txt'...
Corpus is now 11218688 characters long

Reading '../data/hin_corp_unicode/741_utf.txt'...
Corpus is now 11234001 characters long

Reading '../data/hin_corp_unicode/742_utf.txt'...
Corpus is now 11245665 characters long

Reading '../data/hin_corp_unicode/743_utf.txt'...
Corpus is now 11258925 characters long

Reading '../data/hin_corp_unicode/744_utf.txt'...
Corpus is now 11272436 characters long

Reading '../data/hin_corp_unicode/745_utf.txt'...
Corpus is now 11285929 characters long

Reading '../data/hin_corp_unicode/746_utf.txt'...
Corpus is now 11299524 characters long

Reading '../data/hin_corp_unicode/747_utf.txt'...
Corpus is now 11312383 characters long

Reading '../data/hin_corp_unicode/748_utf.txt'...
Corpus is now 11327924 characters long

Reading '../data/hin_corp_unicode/749_utf.txt'...
Corpus is now 11342558 characters long

Reading '../data/hin_corp_unicode/74_utf.txt'...
Corpus is now 11354145 characters long

Reading '../data/hin_corp_unicode/750_utf.txt'...
Corpus is now 11365462 characters long

Reading '../data/hin_corp_unicode/751_utf.txt'...
Corpus is now 11382783 characters long

Reading '../data/hin_corp_unicode/752_utf.txt'...
Corpus is now 11396066 characters long

Reading '../data/hin_corp_unicode/753_utf.txt'...
Corpus is now 11409996 characters long

Reading '../data/hin_corp_unicode/754_utf.txt'...
Corpus is now 11422730 characters long

Reading '../data/hin_corp_unicode/755_utf.txt'...
Corpus is now 11435068 characters long

Reading '../data/hin_corp_unicode/756_utf.txt'...
Corpus is now 11450901 characters long

Reading '../data/hin_corp_unicode/757_utf.txt'...
Corpus is now 11472399 characters long

Reading '../data/hin_corp_unicode/758_utf.txt'...
Corpus is now 11484637 characters long

Reading '../data/hin_corp_unicode/75_utf.txt'...
Corpus is now 11494197 characters long

Reading '../data/hin_corp_unicode/760_utf.txt'...
Corpus is now 11507662 characters long

Reading '../data/hin_corp_unicode/761_utf.txt'...
Corpus is now 11522556 characters long

Reading '../data/hin_corp_unicode/762_utf.txt'...
Corpus is now 11535130 characters long

Reading '../data/hin_corp_unicode/763_utf.txt'...
Corpus is now 11550886 characters long

Reading '../data/hin_corp_unicode/764_utf.txt'...
Corpus is now 11567135 characters long

Reading '../data/hin_corp_unicode/765_utf.txt'...
Corpus is now 11583079 characters long

Reading '../data/hin_corp_unicode/766_utf.txt'...
Corpus is now 11596844 characters long

Reading '../data/hin_corp_unicode/767_utf.txt'...
Corpus is now 11611648 characters long

Reading '../data/hin_corp_unicode/768_utf.txt'...
Corpus is now 11628930 characters long

Reading '../data/hin_corp_unicode/769_utf.txt'...
Corpus is now 11634436 characters long

Reading '../data/hin_corp_unicode/76_utf.txt'...
Corpus is now 11645323 characters long

Reading '../data/hin_corp_unicode/770_utf.txt'...
Corpus is now 11658798 characters long

Reading '../data/hin_corp_unicode/771_utf.txt'...
Corpus is now 11674530 characters long

Reading '../data/hin_corp_unicode/772_utf.txt'...
Corpus is now 11687265 characters long

Reading '../data/hin_corp_unicode/773_utf.txt'...
Corpus is now 11700966 characters long

Reading '../data/hin_corp_unicode/774_utf.txt'...
Corpus is now 11714036 characters long

Reading '../data/hin_corp_unicode/775_utf.txt'...
Corpus is now 11727392 characters long

Reading '../data/hin_corp_unicode/776_utf.txt'...
Corpus is now 11740380 characters long

Reading '../data/hin_corp_unicode/777_utf.txt'...
Corpus is now 11757955 characters long

Reading '../data/hin_corp_unicode/778_utf.txt'...
Corpus is now 11769969 characters long

Reading '../data/hin_corp_unicode/779_utf.txt'...
Corpus is now 11775475 characters long

Reading '../data/hin_corp_unicode/77_utf.txt'...
Corpus is now 11786361 characters long

Reading '../data/hin_corp_unicode/780_utf.txt'...
Corpus is now 11802161 characters long

Reading '../data/hin_corp_unicode/781_utf.txt'...
Corpus is now 11817726 characters long

Reading '../data/hin_corp_unicode/782_utf.txt'...
Corpus is now 11837079 characters long

Reading '../data/hin_corp_unicode/783_utf.txt'...
Corpus is now 11848865 characters long

Reading '../data/hin_corp_unicode/784_utf.txt'...
Corpus is now 11865137 characters long

Reading '../data/hin_corp_unicode/785_utf.txt'...
Corpus is now 11878090 characters long

Reading '../data/hin_corp_unicode/786_utf.txt'...
Corpus is now 11891018 characters long

Reading '../data/hin_corp_unicode/787_utf.txt'...
Corpus is now 11917347 characters long

Reading '../data/hin_corp_unicode/788_utf.txt'...
Corpus is now 11928834 characters long

Reading '../data/hin_corp_unicode/789_utf.txt'...
Corpus is now 11942916 characters long

Reading '../data/hin_corp_unicode/78_utf.txt'...
Corpus is now 11952171 characters long

Reading '../data/hin_corp_unicode/790_utf.txt'...
Corpus is now 11967691 characters long

Reading '../data/hin_corp_unicode/791_utf.txt'...
Corpus is now 11980744 characters long

Reading '../data/hin_corp_unicode/792_utf.txt'...
Corpus is now 11993556 characters long

Reading '../data/hin_corp_unicode/793_utf.txt'...
Corpus is now 12008262 characters long

Reading '../data/hin_corp_unicode/794_utf.txt'...
Corpus is now 12021853 characters long

Reading '../data/hin_corp_unicode/795_utf.txt'...
Corpus is now 12035320 characters long

Reading '../data/hin_corp_unicode/796_utf.txt'...
Corpus is now 12051035 characters long

Reading '../data/hin_corp_unicode/797_utf.txt'...
Corpus is now 12065971 characters long

Reading '../data/hin_corp_unicode/798_utf.txt'...
Corpus is now 12078871 characters long

Reading '../data/hin_corp_unicode/799_utf.txt'...
Corpus is now 12093663 characters long

Reading '../data/hin_corp_unicode/79_utf.txt'...
Corpus is now 12105687 characters long

Reading '../data/hin_corp_unicode/7_utf8.txt'...
Corpus is now 12105687 characters long

Reading '../data/hin_corp_unicode/800_utf.txt'...
Corpus is now 12119381 characters long

Reading '../data/hin_corp_unicode/801_utf.txt'...
Corpus is now 12134767 characters long

Reading '../data/hin_corp_unicode/802_utf.txt'...
Corpus is now 12151520 characters long

Reading '../data/hin_corp_unicode/803_utf.txt'...
Corpus is now 12166176 characters long

Reading '../data/hin_corp_unicode/804_utf.txt'...
Corpus is now 12179776 characters long

Reading '../data/hin_corp_unicode/805_utf.txt'...
Corpus is now 12196729 characters long

Reading '../data/hin_corp_unicode/806_utf.txt'...
Corpus is now 12214104 characters long

Reading '../data/hin_corp_unicode/807_utf.txt'...
Corpus is now 12227821 characters long

Reading '../data/hin_corp_unicode/808_utf.txt'...
Corpus is now 12242348 characters long

Reading '../data/hin_corp_unicode/809_utf.txt'...
Corpus is now 12255545 characters long

Reading '../data/hin_corp_unicode/80_utf.txt'...
Corpus is now 12268916 characters long

Reading '../data/hin_corp_unicode/810_utf.txt'...
Corpus is now 12283026 characters long

Reading '../data/hin_corp_unicode/811_utf.txt'...
Corpus is now 12296520 characters long

Reading '../data/hin_corp_unicode/812_utf.txt'...
Corpus is now 12307587 characters long

Reading '../data/hin_corp_unicode/813_utf.txt'...
Corpus is now 12320805 characters long

Reading '../data/hin_corp_unicode/814_utf.txt'...
Corpus is now 12336656 characters long

Reading '../data/hin_corp_unicode/815_utf.txt'...
Corpus is now 12345154 characters long

Reading '../data/hin_corp_unicode/816_utf.txt'...
Corpus is now 12353201 characters long

Reading '../data/hin_corp_unicode/817_utf.txt'...
Corpus is now 12372928 characters long

Reading '../data/hin_corp_unicode/818_utf.txt'...
Corpus is now 12379167 characters long

Reading '../data/hin_corp_unicode/819_utf.txt'...
Corpus is now 12388006 characters long

Reading '../data/hin_corp_unicode/81_utf.txt'...
Corpus is now 12397316 characters long

Reading '../data/hin_corp_unicode/820_utf.txt'...
Corpus is now 12414191 characters long

Reading '../data/hin_corp_unicode/821_utf.txt'...
Corpus is now 12421280 characters long

Reading '../data/hin_corp_unicode/822_utf.txt'...
Corpus is now 12430393 characters long

Reading '../data/hin_corp_unicode/823_utf.txt'...
Corpus is now 12439506 characters long

Reading '../data/hin_corp_unicode/824_utf.txt'...
Corpus is now 12451944 characters long

Reading '../data/hin_corp_unicode/825_utf.txt'...
Corpus is now 12462393 characters long

Reading '../data/hin_corp_unicode/826_utf.txt'...
Corpus is now 12474503 characters long

Reading '../data/hin_corp_unicode/827_utf.txt'...
Corpus is now 12490700 characters long

Reading '../data/hin_corp_unicode/828_utf.txt'...
Corpus is now 12506058 characters long

Reading '../data/hin_corp_unicode/829_utf.txt'...
Corpus is now 12518199 characters long

Reading '../data/hin_corp_unicode/82_utf.txt'...
Corpus is now 12530014 characters long

Reading '../data/hin_corp_unicode/830_utf.txt'...
Corpus is now 12542118 characters long

Reading '../data/hin_corp_unicode/831_utf.txt'...
Corpus is now 12554677 characters long

Reading '../data/hin_corp_unicode/832_utf.txt'...
Corpus is now 12564654 characters long

Reading '../data/hin_corp_unicode/833_utf.txt'...
Corpus is now 12577332 characters long

Reading '../data/hin_corp_unicode/834_utf.txt'...
Corpus is now 12587062 characters long

Reading '../data/hin_corp_unicode/835_utf.txt'...
Corpus is now 12598874 characters long

Reading '../data/hin_corp_unicode/836_utf.txt'...
Corpus is now 12608597 characters long

Reading '../data/hin_corp_unicode/837_utf.txt'...
Corpus is now 12622538 characters long

Reading '../data/hin_corp_unicode/838_utf.txt'...
Corpus is now 12642945 characters long

Reading '../data/hin_corp_unicode/839_utf.txt'...
Corpus is now 12655994 characters long

Reading '../data/hin_corp_unicode/83_utf.txt'...
Corpus is now 12669233 characters long

Reading '../data/hin_corp_unicode/840_utf.txt'...
Corpus is now 12673941 characters long

Reading '../data/hin_corp_unicode/841_utf.txt'...
Corpus is now 12685790 characters long

Reading '../data/hin_corp_unicode/842_utf.txt'...
Corpus is now 12699759 characters long

Reading '../data/hin_corp_unicode/843_utf.txt'...
Corpus is now 12711200 characters long

Reading '../data/hin_corp_unicode/844_utf.txt'...
Corpus is now 12724008 characters long

Reading '../data/hin_corp_unicode/845_utf.txt'...
Corpus is now 12735228 characters long

Reading '../data/hin_corp_unicode/846_utf.txt'...
Corpus is now 12749624 characters long

Reading '../data/hin_corp_unicode/847_utf.txt'...
Corpus is now 12761403 characters long

Reading '../data/hin_corp_unicode/848_utf.txt'...
Corpus is now 12773618 characters long

Reading '../data/hin_corp_unicode/849_utf.txt'...
Corpus is now 12784239 characters long

Reading '../data/hin_corp_unicode/84_utf.txt'...
Corpus is now 12795135 characters long

Reading '../data/hin_corp_unicode/850_utf.txt'...
Corpus is now 12807055 characters long

Reading '../data/hin_corp_unicode/851_utf.txt'...
Corpus is now 12819870 characters long

Reading '../data/hin_corp_unicode/852_utf.txt'...
Corpus is now 12829972 characters long

Reading '../data/hin_corp_unicode/853_utf.txt'...
Corpus is now 12841817 characters long

Reading '../data/hin_corp_unicode/854_utf.txt'...
Corpus is now 12856310 characters long

Reading '../data/hin_corp_unicode/855_utf.txt'...
Corpus is now 12870238 characters long

Reading '../data/hin_corp_unicode/856_utf.txt'...
Corpus is now 12882681 characters long

Reading '../data/hin_corp_unicode/857_utf.txt'...
Corpus is now 12895620 characters long

Reading '../data/hin_corp_unicode/858_utf.txt'...
Corpus is now 12909206 characters long

Reading '../data/hin_corp_unicode/859_utf.txt'...
Corpus is now 12924180 characters long

Reading '../data/hin_corp_unicode/85_utf.txt'...
Corpus is now 12937795 characters long

Reading '../data/hin_corp_unicode/860_utf.txt'...
Corpus is now 12947251 characters long

Reading '../data/hin_corp_unicode/861_utf.txt'...
Corpus is now 12960607 characters long

Reading '../data/hin_corp_unicode/862_utf.txt'...
Corpus is now 12972621 characters long

Reading '../data/hin_corp_unicode/863_utf.txt'...
Corpus is now 12990196 characters long

Reading '../data/hin_corp_unicode/864_utf.txt'...
Corpus is now 13005997 characters long

Reading '../data/hin_corp_unicode/865_utf.txt'...
Corpus is now 13018951 characters long

Reading '../data/hin_corp_unicode/866_utf.txt'...
Corpus is now 13034516 characters long

Reading '../data/hin_corp_unicode/867_utf.txt'...
Corpus is now 13047444 characters long

Reading '../data/hin_corp_unicode/868_utf.txt'...
Corpus is now 13059611 characters long

Reading '../data/hin_corp_unicode/869_utf.txt'...
Corpus is now 13073305 characters long

Reading '../data/hin_corp_unicode/86_utf.txt'...
Corpus is now 13086265 characters long

Reading '../data/hin_corp_unicode/870_utf.txt'...
Corpus is now 13097752 characters long

Reading '../data/hin_corp_unicode/871_utf.txt'...
Corpus is now 13113272 characters long

Reading '../data/hin_corp_unicode/872_utf.txt'...
Corpus is now 13126325 characters long

Reading '../data/hin_corp_unicode/873_utf.txt'...
Corpus is now 13139137 characters long

Reading '../data/hin_corp_unicode/874_utf.txt'...
Corpus is now 13153844 characters long

Reading '../data/hin_corp_unicode/875_utf.txt'...
Corpus is now 13169559 characters long

Reading '../data/hin_corp_unicode/876_utf.txt'...
Corpus is now 13183026 characters long

Reading '../data/hin_corp_unicode/877_utf.txt'...
Corpus is now 13195926 characters long

Reading '../data/hin_corp_unicode/878_utf.txt'...
Corpus is now 13210718 characters long

Reading '../data/hin_corp_unicode/879_utf.txt'...
Corpus is now 13224412 characters long

Reading '../data/hin_corp_unicode/87_utf.txt'...
Corpus is now 13236690 characters long

Reading '../data/hin_corp_unicode/880_utf.txt'...
Corpus is now 13252076 characters long

Reading '../data/hin_corp_unicode/881_utf.txt'...
Corpus is now 13266732 characters long

Reading '../data/hin_corp_unicode/882_utf.txt'...
Corpus is now 13275230 characters long

Reading '../data/hin_corp_unicode/883_utf.txt'...
Corpus is now 13283480 characters long

Reading '../data/hin_corp_unicode/884_utf.txt'...
Corpus is now 13291370 characters long

Reading '../data/hin_corp_unicode/885_utf.txt'...
Corpus is now 13297609 characters long

Reading '../data/hin_corp_unicode/886_utf.txt'...
Corpus is now 13306448 characters long

Reading '../data/hin_corp_unicode/887_utf.txt'...
Corpus is now 13314676 characters long

Reading '../data/hin_corp_unicode/888_utf.txt'...
Corpus is now 13321765 characters long

Reading '../data/hin_corp_unicode/889_utf.txt'...
Corpus is now 13330878 characters long

Reading '../data/hin_corp_unicode/88_utf.txt'...
Corpus is now 13343327 characters long

Reading '../data/hin_corp_unicode/890_utf.txt'...
Corpus is now 13352394 characters long

Reading '../data/hin_corp_unicode/891_utf.txt'...
Corpus is now 13364504 characters long

Reading '../data/hin_corp_unicode/892_utf.txt'...
Corpus is now 13380701 characters long

Reading '../data/hin_corp_unicode/893_utf.txt'...
Corpus is now 13392859 characters long

Reading '../data/hin_corp_unicode/894_utf.txt'...
Corpus is now 13405418 characters long

Reading '../data/hin_corp_unicode/895_utf.txt'...
Corpus is now 13415395 characters long

Reading '../data/hin_corp_unicode/896_utf.txt'...
Corpus is now 13425371 characters long

Reading '../data/hin_corp_unicode/897_utf.txt'...
Corpus is now 13434827 characters long

Reading '../data/hin_corp_unicode/898_utf.txt'...
Corpus is now 13446371 characters long

Reading '../data/hin_corp_unicode/89_utf.txt'...
Corpus is now 13458189 characters long

Reading '../data/hin_corp_unicode/8_utf8.txt'...
Corpus is now 13458189 characters long

Reading '../data/hin_corp_unicode/900_utf.txt'...
Corpus is now 13470020 characters long

Reading '../data/hin_corp_unicode/901_utf.txt'...
Corpus is now 13479722 characters long

Reading '../data/hin_corp_unicode/902_utf.txt'...
Corpus is now 13490925 characters long

Reading '../data/hin_corp_unicode/903_utf.txt'...
Corpus is now 13502844 characters long

Reading '../data/hin_corp_unicode/904_utf.txt'...
Corpus is now 13513868 characters long

Reading '../data/hin_corp_unicode/905_utf.txt'...
Corpus is now 13525254 characters long

Reading '../data/hin_corp_unicode/906_utf.txt'...
Corpus is now 13537859 characters long

Reading '../data/hin_corp_unicode/907_utf.txt'...
Corpus is now 13549753 characters long

Reading '../data/hin_corp_unicode/908_utf.txt'...
Corpus is now 13560771 characters long

Reading '../data/hin_corp_unicode/909_utf.txt'...
Corpus is now 13571730 characters long

Reading '../data/hin_corp_unicode/90_utf.txt'...
Corpus is now 13584037 characters long

Reading '../data/hin_corp_unicode/910_utf.txt'...
Corpus is now 13596435 characters long

Reading '../data/hin_corp_unicode/911_utf.txt'...
Corpus is now 13607979 characters long

Reading '../data/hin_corp_unicode/912_utf.txt'...
Corpus is now 13622487 characters long

Reading '../data/hin_corp_unicode/913_utf.txt'...
Corpus is now 13634943 characters long

Reading '../data/hin_corp_unicode/914_utf.txt'...
Corpus is now 13649187 characters long

Reading '../data/hin_corp_unicode/915_utf.txt'...
Corpus is now 13664443 characters long

Reading '../data/hin_corp_unicode/916_utf.txt'...
Corpus is now 13676849 characters long

Reading '../data/hin_corp_unicode/917_utf.txt'...
Corpus is now 13692162 characters long

Reading '../data/hin_corp_unicode/918_utf.txt'...
Corpus is now 13712625 characters long

Reading '../data/hin_corp_unicode/919_utf.txt'...
Corpus is now 13723017 characters long

Reading '../data/hin_corp_unicode/91_utf.txt'...
Corpus is now 13738274 characters long

Reading '../data/hin_corp_unicode/920_utf.txt'...
Corpus is now 13751930 characters long

Reading '../data/hin_corp_unicode/921_utf.txt'...
Corpus is now 13766649 characters long

Reading '../data/hin_corp_unicode/922_utf.txt'...
Corpus is now 13778358 characters long

Reading '../data/hin_corp_unicode/923_utf.txt'...
Corpus is now 13791594 characters long

Reading '../data/hin_corp_unicode/924_utf.txt'...
Corpus is now 13805381 characters long

Reading '../data/hin_corp_unicode/925_utf.txt'...
Corpus is now 13825778 characters long

Reading '../data/hin_corp_unicode/926_utf.txt'...
Corpus is now 13835784 characters long

Reading '../data/hin_corp_unicode/927_utf.txt'...
Corpus is now 13850183 characters long

Reading '../data/hin_corp_unicode/928_utf.txt'...
Corpus is now 13861206 characters long

Reading '../data/hin_corp_unicode/929_utf.txt'...
Corpus is now 13872528 characters long

Reading '../data/hin_corp_unicode/92_utf.txt'...
Corpus is now 13886639 characters long

Reading '../data/hin_corp_unicode/930_utf.txt'...
Corpus is now 13900966 characters long

Reading '../data/hin_corp_unicode/931_utf.txt'...
Corpus is now 13913022 characters long

Reading '../data/hin_corp_unicode/932_utf.txt'...
Corpus is now 13926545 characters long

Reading '../data/hin_corp_unicode/933_utf.txt'...
Corpus is now 13936506 characters long

Reading '../data/hin_corp_unicode/934_utf.txt'...
Corpus is now 13951231 characters long

Reading '../data/hin_corp_unicode/935_utf.txt'...
Corpus is now 13965300 characters long

Reading '../data/hin_corp_unicode/936_utf.txt'...
Corpus is now 13976427 characters long

Reading '../data/hin_corp_unicode/937_utf.txt'...
Corpus is now 13994126 characters long

Reading '../data/hin_corp_unicode/938_utf.txt'...
Corpus is now 14007623 characters long

Reading '../data/hin_corp_unicode/939_utf.txt'...
Corpus is now 14021720 characters long

Reading '../data/hin_corp_unicode/93_utf.txt'...
Corpus is now 14032675 characters long

Reading '../data/hin_corp_unicode/940_utf.txt'...
Corpus is now 14046509 characters long

Reading '../data/hin_corp_unicode/941_utf.txt'...
Corpus is now 14061198 characters long

Reading '../data/hin_corp_unicode/942_utf.txt'...
Corpus is now 14078415 characters long

Reading '../data/hin_corp_unicode/943_utf.txt'...
Corpus is now 14089263 characters long

Reading '../data/hin_corp_unicode/944_utf.txt'...
Corpus is now 14109693 characters long

Reading '../data/hin_corp_unicode/945_utf.txt'...
Corpus is now 14130998 characters long

Reading '../data/hin_corp_unicode/946_utf.txt'...
Corpus is now 14143492 characters long

Reading '../data/hin_corp_unicode/947_utf.txt'...
Corpus is now 14160038 characters long

Reading '../data/hin_corp_unicode/948_utf.txt'...
Corpus is now 14172985 characters long

Reading '../data/hin_corp_unicode/949_utf.txt'...
Corpus is now 14187279 characters long

Reading '../data/hin_corp_unicode/94_utf.txt'...
Corpus is now 14200265 characters long

Reading '../data/hin_corp_unicode/950_utf.txt'...
Corpus is now 14212501 characters long

Reading '../data/hin_corp_unicode/951_utf.txt'...
Corpus is now 14225940 characters long

Reading '../data/hin_corp_unicode/952_utf.txt'...
Corpus is now 14240286 characters long

Reading '../data/hin_corp_unicode/953_utf.txt'...
Corpus is now 14256915 characters long

Reading '../data/hin_corp_unicode/954_utf.txt'...
Corpus is now 14270685 characters long

Reading '../data/hin_corp_unicode/955_utf.txt'...
Corpus is now 14287032 characters long

Reading '../data/hin_corp_unicode/956_utf.txt'...
Corpus is now 14300289 characters long

Reading '../data/hin_corp_unicode/957_utf.txt'...
Corpus is now 14314940 characters long

Reading '../data/hin_corp_unicode/958_utf.txt'...
Corpus is now 14328271 characters long

Reading '../data/hin_corp_unicode/959_utf.txt'...
Corpus is now 14354577 characters long

Reading '../data/hin_corp_unicode/95_utf.txt'...
Corpus is now 14365581 characters long

Reading '../data/hin_corp_unicode/960_utf.txt'...
Corpus is now 14379365 characters long

Reading '../data/hin_corp_unicode/961_utf.txt'...
Corpus is now 14392768 characters long

Reading '../data/hin_corp_unicode/962_utf.txt'...
Corpus is now 14405859 characters long

Reading '../data/hin_corp_unicode/963_utf.txt'...
Corpus is now 14419954 characters long

Reading '../data/hin_corp_unicode/964_utf.txt'...
Corpus is now 14431329 characters long

Reading '../data/hin_corp_unicode/965_utf.txt'...
Corpus is now 14444094 characters long

Reading '../data/hin_corp_unicode/966_utf.txt'...
Corpus is now 14470019 characters long

Reading '../data/hin_corp_unicode/967_utf.txt'...
Corpus is now 14482150 characters long

Reading '../data/hin_corp_unicode/968_utf.txt'...
Corpus is now 14494282 characters long

Reading '../data/hin_corp_unicode/969_utf.txt'...
Corpus is now 14507627 characters long

Reading '../data/hin_corp_unicode/96_utf.txt'...
Corpus is now 14515777 characters long

Reading '../data/hin_corp_unicode/970_utf.txt'...
Corpus is now 14530285 characters long

Reading '../data/hin_corp_unicode/971_utf.txt'...
Corpus is now 14543879 characters long

Reading '../data/hin_corp_unicode/972_utf.txt'...
Corpus is now 14557111 characters long

Reading '../data/hin_corp_unicode/973_utf.txt'...
Corpus is now 14575834 characters long

Reading '../data/hin_corp_unicode/974_utf.txt'...
Corpus is now 14598645 characters long

Reading '../data/hin_corp_unicode/975_utf.txt'...
Corpus is now 14609721 characters long

Reading '../data/hin_corp_unicode/976_utf.txt'...
Corpus is now 14624215 characters long

Reading '../data/hin_corp_unicode/977_utf.txt'...
Corpus is now 14635966 characters long

Reading '../data/hin_corp_unicode/978_utf.txt'...
Corpus is now 14647147 characters long

Reading '../data/hin_corp_unicode/979_utf.txt'...
Corpus is now 14660492 characters long

Reading '../data/hin_corp_unicode/97_utf.txt'...
Corpus is now 14671425 characters long

Reading '../data/hin_corp_unicode/980_utf.txt'...
Corpus is now 14684640 characters long

Reading '../data/hin_corp_unicode/981_utf.txt'...
Corpus is now 14698733 characters long

Reading '../data/hin_corp_unicode/982_utf.txt'...
Corpus is now 14725882 characters long

Reading '../data/hin_corp_unicode/983_utf.txt'...
Corpus is now 14740731 characters long

Reading '../data/hin_corp_unicode/984_utf.txt'...
Corpus is now 14753392 characters long

Reading '../data/hin_corp_unicode/985_utf.txt'...
Corpus is now 14765019 characters long

Reading '../data/hin_corp_unicode/986_utf.txt'...
Corpus is now 14780342 characters long

Reading '../data/hin_corp_unicode/987_utf.txt'...
Corpus is now 14792006 characters long

Reading '../data/hin_corp_unicode/988_utf.txt'...
Corpus is now 14803765 characters long

Reading '../data/hin_corp_unicode/989_utf.txt'...
Corpus is now 14813858 characters long

Reading '../data/hin_corp_unicode/98_utf.txt'...
Corpus is now 14830705 characters long

Reading '../data/hin_corp_unicode/990_utf.txt'...
Corpus is now 14843384 characters long

Reading '../data/hin_corp_unicode/991_utf.txt'...
Corpus is now 14856712 characters long

Reading '../data/hin_corp_unicode/992_utf.txt'...
Corpus is now 14871614 characters long

Reading '../data/hin_corp_unicode/993_utf.txt'...
Corpus is now 14883496 characters long

Reading '../data/hin_corp_unicode/994_utf.txt'...
Corpus is now 14898129 characters long

Reading '../data/hin_corp_unicode/995_utf.txt'...
Corpus is now 14915303 characters long

Reading '../data/hin_corp_unicode/996_utf.txt'...
Corpus is now 14932263 characters long

Reading '../data/hin_corp_unicode/997_utf.txt'...
Corpus is now 14944914 characters long

Reading '../data/hin_corp_unicode/998_utf.txt'...
Corpus is now 14961333 characters long

Reading '../data/hin_corp_unicode/999_utf.txt'...
Corpus is now 14977613 characters long

Reading '../data/hin_corp_unicode/99_utf.txt'...
Corpus is now 14990762 characters long

Reading '../data/hin_corp_unicode/9_utf8.txt'...
Corpus is now 14990762 characters long


In [8]:
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

In [9]:
raw_sentences = tokenizer.tokenize(corpus_raw)

In [10]:
def sentence_to_wordlist(raw):
    clean = re.sub("[.\r\n]"," ", raw)
    words = clean.split()
    return words

sentences = []
for raw_sentence in raw_sentences:
    if len(raw_sentence) > 0:
        sentences.append(sentence_to_wordlist(raw_sentence))

In [11]:
token_count = sum([len(sentence) for sentence in sentences])
print("The Hindi corpus contains {0:,} tokens".format(token_count))


The Hindi corpus contains 2,840,476 tokens

In [12]:
sentences[0]


Out[12]:
[u'\u0915\u093f\u0938\u094d\u092e\u094b\u0902',
 u'\u0915\u0947',
 u'\u0935\u093f\u0915\u093e\u0938',
 u'\u092e\u0947\u0902',
 u'\u0938\u0902\u0915\u0940\u0930\u094d\u0923',
 u'\u091c\u0940\u0928',
 u'\u0906\u0927\u093e\u0930\u094b\u0902',
 u'\u0915\u0947',
 u'\u092c\u095d\u0924\u0947',
 u'\u0909\u092a\u092f\u094b\u0917',
 u'\u0938\u0947',
 u'\u090f\u0915',
 u'\u0914\u0930',
 u'\u0939\u093e\u0928\u093f',
 u'\u0939\u0941\u0908']

Word Vectors


In [13]:
# Dimensionality of the resulting word vectors.
# More dimensions = more generalized
num_features = 50
# Minimum word count threshold.
min_word_count = 3

# Number of threads to run in parallel.
num_threads = multiprocessing.cpu_count()

# Context window length.
context_size = 8

# Downsample setting for frequent words.
#0 - 1e-5 is good for this
downsampling = 1e-3

# Seed for the RNG, to make the results reproducible.
# Random Number Generator
seed = 1

In [14]:
# Defining the model
model = w2v.Word2Vec(
    sg=1,
    seed=seed,
    workers=num_threads,
    size=num_features,
    min_count=min_word_count,
    window=context_size,
    sample=downsampling
)

In [15]:
model.build_vocab(sentences)

In [16]:
model.train(sentences)


Out[16]:
10730520

In [17]:
# Save our model
model.save(os.path.join("../data/", "hindi_word2Vec_small.w2v"))

Explore the model


In [ ]:
trained_model = w2v.Word2Vec.load(os.path.join("../data/", "hindi_word2Vec_small.w2v"))

In [ ]:
# For reducing dimensiomns, to visualize vectors
tsne = sklearn.manifold.TSNE(n_components=2, random_state=0)
all_word_vectors_matrix = trained_model.syn1neg[:200] # Currently giving memory error for all words
# Reduced dimensions
all_word_vectors_matrix_2d = tsne.fit_transform(all_word_vectors_matrix)

In [ ]:
points = pd.DataFrame(
    [
        (word, coords[0], coords[1])
        for word, coords in [
            (word, all_word_vectors_matrix_2d[trained_model.wv.vocab[word].index])
            for word in trained_model.wv.vocab
            if trained_model.wv.vocab[word].index < 200
        ]
    ],
    columns=["word", "x", "y"]
)

In [ ]:
s = trained_model.wv[u"आधार"]

In [ ]: